diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 5bb0e3648d2ec..421fc429048ff 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -5324,6 +5324,12 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
   unsigned CPol = Inst.getOperand(CPolPos).getImm();
 
   if (!isGFX1250()) {
+    if (CPol & CPol::SCAL) {
+      SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
+      StringRef CStr(S.getPointer());
+      S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scale_offset")]);
+      Error(S, "scale_offset is not supported on this GPU");
+    }
     if (CPol & CPol::NV) {
       SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
       StringRef CStr(S.getPointer());
@@ -5332,6 +5338,13 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
     }
   }
 
+  if ((CPol & CPol::SCAL) && !supportsScaleOffset(MII, Inst.getOpcode())) {
+    SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
+    StringRef CStr(S.getPointer());
+    S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scale_offset")]);
+    Error(S, "scale_offset is not supported for this instruction");
+  }
+
   if (isGFX12Plus())
     return validateTHAndScopeBits(Inst, Operands, CPol);
 
@@ -7003,6 +7016,7 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
     ParseStatus ResTH = ParseStatus::NoMatch;
     ParseStatus ResScope = ParseStatus::NoMatch;
     ParseStatus ResNV = ParseStatus::NoMatch;
+    ParseStatus ResScal = ParseStatus::NoMatch;
 
     for (;;) {
       if (ResTH.isNoMatch()) {
@@ -7041,10 +7055,22 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
         }
       }
 
+      if (ResScal.isNoMatch()) {
+        if (trySkipId("scale_offset")) {
+          ResScal = ParseStatus::Success;
+          CPolVal |= CPol::SCAL;
+          continue;
+        } else if (trySkipId("no", "scale_offset")) {
+          ResScal = ParseStatus::Success;
+          continue;
+        }
+      }
+
       break;
     }
 
-    if (ResTH.isNoMatch() && ResScope.isNoMatch() && ResNV.isNoMatch())
+    if (ResTH.isNoMatch() && ResScope.isNoMatch() && ResNV.isNoMatch() &&
+        ResScal.isNoMatch())
       return ParseStatus::NoMatch;
 
     Operands.push_back(AMDGPUOperand::CreateImm(this, CPolVal, StringLoc,
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index f7f29f17f9d0e..1cc717bb437de 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -2941,6 +2941,7 @@ multiclass VFLAT_Real_gfx12 <bits<8> op, string name = get_FLAT_ps<NAME>.Mnemoni
     let DecoderNamespace = "GFX12";
 
     let Inst{25-24} = {ps.is_flat_global, ps.is_flat_scratch};
+    let Inst{48} = cpol{CPolBit.SCAL}; // scale offset
   }
 }
 
@@ -3170,6 +3171,7 @@ multiclass VFLAT_Real_gfx1250<bits<8> op,
     let DecoderNamespace = "GFX1250";
 
     let Inst{25-24} = {ps.is_flat_global, ps.is_flat_scratch};
+    let Inst{48} = cpol{CPolBit.SCAL}; // scale offset
   }
 }
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 268162bcada47..cbc517d3da680 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1162,6 +1162,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; }
 
+  // Scalar and global loads support scale_offset bit.
+  bool hasScaleOffset() const { return GFX1250Insts; }
+
   bool hasFlatGVSMode() const { return FlatGVSMode; }
 
   bool enableSIScheduler() const {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 197bb3f0b569b..11b072e05a6e1 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -157,6 +157,9 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
     const int64_t TH = Imm & CPol::TH;
     const int64_t Scope = Imm & CPol::SCOPE;
 
+    if (Imm & CPol::SCAL)
+      O << " scale_offset";
+
     printTH(MI, TH, Scope, O);
     printScope(Scope, O);
 
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index d3790880532a8..3902d4c3b1027 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -402,6 +402,8 @@ enum CPol {
 
   SWZ = 1 << 6, // Swizzle bit
 
+  SCAL = 1 << 11, // Scale offset bit
+
   ALL = TH | SCOPE,
 
   // Helper bits
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index e172c0b63189b..e5d1eaad2b8f4 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1209,18 +1209,24 @@ void SIFoldOperandsImpl::foldOperand(
         return;
     }
 
-    // A frame index will resolve to a positive constant, so it should always be
-    // safe to fold the addressing mode, even pre-GFX9.
-    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());
-
     const unsigned Opc = UseMI->getOpcode();
     if (TII->isFLATScratch(*UseMI) &&
         AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
         !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
       unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
+      unsigned CPol =
+          TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm();
+      if ((CPol & AMDGPU::CPol::SCAL) &&
+          !AMDGPU::supportsScaleOffset(*TII, NewOpc))
+        return;
+
       UseMI->setDesc(TII->get(NewOpc));
     }
 
+    // A frame index will resolve to a positive constant, so it should always be
+    // safe to fold the addressing mode, even pre-GFX9.
+    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());
+
     return;
   }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 6b419347c01d9..89d9b0d32b25b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -318,6 +318,7 @@ def CPolBit {
   int DLC = 2;
   int SCC = 4;
   int NV = 5;
+  int SCAL = 11;
 }
 
 class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e2a2525d909bd..571f3efd68260 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5482,6 +5482,19 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
     }
   }
 
+  if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
+    if (CPol->getImm() & AMDGPU::CPol::SCAL) {
+      if (!ST.hasScaleOffset()) {
+        ErrInfo = "Subtarget does not support offset scaling";
+        return false;
+      }
+      if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
+        ErrInfo = "Instruction does not support offset scaling";
+        return false;
+      }
+    }
+  }
+
   return true;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 5097ac03954d5..b49c5a997af78 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -61,6 +61,7 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIDefines.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/InitializePasses.h"
@@ -1078,7 +1079,9 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
     if (EltOffset0 + CI.Width != EltOffset1 &&
             EltOffset1 + Paired.Width != EltOffset0)
       return false;
-    if (CI.CPol != Paired.CPol)
+    // Instructions with scale_offset modifier cannot be combined unless we
+    // also generate a code to scale the offset and reset that bit.
+    if (CI.CPol != Paired.CPol || (CI.CPol & AMDGPU::CPol::SCAL))
       return false;
     if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
         CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index d8b52d271a964..0850c41c933de 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -1488,6 +1488,7 @@ class SMEM_Real_Load_gfx12<bits<6> op, string ps, string opName, OffsetMode offs
   let Inst{20} = cpol{CPolBit.NV}; // non-volatile
   let Inst{22-21} = cpol{4-3}; // scope
   let Inst{24-23} = cpol{1-0}; // th - only lower 2 bits are supported
+  let Inst{56} = cpol{CPolBit.SCAL}; // scale offset
 }
 
 multiclass SM_Real_Loads_gfx12<bits<6> op, string ps = NAME> {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 9c6c3746f9007..b5b3cc97569ed 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -3228,6 +3228,25 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,
                           : getGfx9BufferFormatInfo(Format);
 }
 
+bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode) {
+  uint64_t TSFlags = MII.get(Opcode).TSFlags;
+
+  if (TSFlags & SIInstrFlags::SMRD)
+    return !getSMEMIsBuffer(Opcode);
+  if (!(TSFlags & SIInstrFlags::FLAT))
+    return false;
+
+  // Only SV and SVS modes are supported.
+  if (TSFlags & SIInstrFlags::FlatScratch)
+    return hasNamedOperand(Opcode, OpName::vaddr);
+
+  // Only GVS mode is supported.
+  return hasNamedOperand(Opcode, OpName::vaddr) &&
+         hasNamedOperand(Opcode, OpName::saddr);
+
+  return false;
+}
+
 bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc) {
   for (auto OpName : {OpName::vdst, OpName::src0, OpName::src1, OpName::src2}) {
     int Idx = getNamedOperandIdx(OpDesc.getOpcode(), OpName);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index bde951b43d105..c09a9d694f3d8 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1757,6 +1757,9 @@ bool isIntrinsicSourceOfDivergence(unsigned IntrID);
 /// \returns true if the intrinsic is uniform
 bool isIntrinsicAlwaysUniform(unsigned IntrID);
 
+/// \returns true if a memory instruction supports scale_offset modifier.
+bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode);
+
 /// \returns lds block size in terms of dwords. \p
 /// This is used to calculate the lds size encoded for PAL metadata 3.0+ which
 /// must be defined in terms of bytes.
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi-gfx1250.mir
new file mode 100644
index 0000000000000..e5955ad3d1d49
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi-gfx1250.mir
@@ -0,0 +1,43 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -start-before=si-fold-operands -stop-after=prologepilog -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name:            test_fold_fi_scratch_load_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  stackPtrOffsetReg: $sgpr32
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 }
+body:             |
+  bb.0.entry:
+    ; GCN-LABEL: name: test_fold_fi_scratch_load_vgpr
+    ; GCN: renamable $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0
+    %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %1:vgpr_32 = SCRATCH_LOAD_DWORD %0:vgpr_32, 4, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
+    S_ENDPGM 0, implicit %1
+
+...
+
+# SS form of the SCRATCH_LOAD_DWORD does not support offset scaling
+
+---
+name:            test_no_fold_fi_scratch_load_vgpr_scale_offset
+tracksRegLiveness: true
+machineFunctionInfo:
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  stackPtrOffsetReg: $sgpr32
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 }
+body:             |
+  bb.0.entry:
+    ; GCN-LABEL: name: test_no_fold_fi_scratch_load_vgpr_scale_offset
+    ; GCN: renamable $vgpr0 = V_MOV_B32_e32 $sgpr32, implicit $exec
+    ; GCN-NEXT: renamable $vgpr0 = SCRATCH_LOAD_DWORD killed renamable $vgpr0, 4, 2048, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0
+    %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %1:vgpr_32 = SCRATCH_LOAD_DWORD %0:vgpr_32, 4, 2048, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
+    S_ENDPGM 0, implicit %1
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir b/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir
new file mode 100644
index 0000000000000..76e2092c8b57a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir
@@ -0,0 +1,104 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-load-store-opt -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name:            merge_global_load_dword_2_no_scale_offset
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_global_load_dword_2_no_scale_offset
+    ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR [[DEF]], [[DEF1]], 0, 1, implicit $exec :: (load (s64) from `ptr addrspace(1) undef` + 4, align 4, addrspace 1)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1
+    ; GCN-NEXT: S_NOP 0, implicit [[DEF1]], implicit [[COPY]]
+    %0:sreg_64_xexec_xnull = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 1, implicit $exec :: (load (s32) from `float addrspace(1)* undef` + 4, basealign 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name:            no_merge_global_load_dword_2_same_scale_offset
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_global_load_dword_2_same_scale_offset
+    ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 2049, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, addrspace 1)
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 4, 2049, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, addrspace 1)
+    ; GCN-NEXT: S_NOP 0, implicit [[DEF1]], implicit [[GLOBAL_LOAD_DWORD_SADDR]]
+    %0:sreg_64_xexec_xnull = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 2049, implicit $exec :: (load (s32) from `float addrspace(1)* undef` + 4, basealign 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 2049, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name:            no_merge_global_load_dword_2_different_scale_offset
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_global_load_dword_2_different_scale_offset
+    ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, addrspace 1)
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 4, 2048, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, addrspace 1)
+    ; GCN-NEXT: S_NOP 0, implicit [[DEF1]], implicit [[GLOBAL_LOAD_DWORD_SADDR]]
+    %0:sreg_64_xexec_xnull = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef` + 4, basealign 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 2048, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+# NB: We do not currently support merging SGPR offset and SGPR+Imm offset forms
+# of S_LOAD, but the check stays the same: these cannot be merged with different
+# scale offsets.
+#
+# We also do not currently merge flat scratch instructions, although a common
+# check in the merge logic that CPol shall not be set for merge to happen.
+
+---
+name: merge_s_load_x1_x1_imm_no_scale_offset
+body: |
+  bb.0:
+    ; GCN-LABEL: name: merge_s_load_x1_x1_imm_no_scale_offset
+    ; GCN: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
+    %0:sgpr_64 = IMPLICIT_DEF
+    %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
+    %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32))
+...
+
+---
+name: no_merge_s_load_x1_x1_imm_same_scale_offset
+body: |
+  bb.0:
+    ; GCN-LABEL: name: no_merge_s_load_x1_x1_imm_same_scale_offset
+    ; GCN: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 2048 :: (dereferenceable invariant load (s32))
+    ; GCN-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 4, 2048 :: (dereferenceable invariant load (s32))
+    %0:sgpr_64 = IMPLICIT_DEF
+    %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 2048 :: (dereferenceable invariant load (s32))
+    %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 2048 :: (dereferenceable invariant load (s32))
+...
+
+---
+name: no_merge_s_load_x1_x1_imm_different_scale_offset
+body: |
+  bb.0:
+    ; GCN-LABEL: name: no_merge_s_load_x1_x1_imm_different_scale_offset
+    ; GCN: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32))
+    ; GCN-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 4, 2048 :: (dereferenceable invariant load (s32))
+    %0:sgpr_64 = IMPLICIT_DEF
+    %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
+    %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 2048 :: (dereferenceable invariant load (s32))
+...
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_smem.s b/llvm/test/MC/AMDGPU/gfx1250_asm_smem.s
index 899c4c7aca0ba..800f66221114e 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_smem.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_smem.s
@@ -12,3 +12,30 @@ s_buffer_load_i8 s5, s[4:7], s0 nv
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: nv is not supported on this GPU
 // GFX12-ERR-NEXT:{{^}}s_buffer_load_i8 s5, s[4:7], s0 nv
 // GFX12-ERR-NEXT:{{^}}                                ^
+
+s_load_b32 s4, s[2:3], 0xa scale_offset
+// GFX1250: s_load_b32 s4, s[2:3], 0xa scale_offset ; encoding: [0x01,0x01,0x00,0xf4,0x0a,0x00,0x00,0xf9]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}s_load_b32 s4, s[2:3], 0xa scale_offset
+// GFX12-ERR-NEXT:{{^}}                           ^
+
+s_load_b32 s4, s[2:3], 0xa scale_offset nv
+// GFX1250: s_load_b32 s4, s[2:3], 0xa scale_offset nv ; encoding: [0x01,0x01,0x10,0xf4,0x0a,0x00,0x00,0xf9]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}s_load_b32 s4, s[2:3], 0xa scale_offset nv
+// GFX12-ERR-NEXT:{{^}}                           ^
+// GFX12-ERR-NEXT: error: nv is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}s_load_b32 s4, s[2:3], 0xa scale_offset nv
+// GFX12-ERR-NEXT:{{^}}                                        ^
+
+s_load_b32 s4, s[2:3], s5 offset:32 scale_offset
+// GFX1250: s_load_b32 s4, s[2:3], s5 offset:0x20 scale_offset ; encoding: [0x01,0x01,0x00,0xf4,0x20,0x00,0x00,0x0b]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}s_load_b32 s4, s[2:3], s5 offset:32 scale_offset
+// GFX12-ERR-NEXT:{{^}}                                    ^
+
+s_load_b32 s4, s[2:3], m0 offset:32 scale_offset
+// GFX1250: s_load_b32 s4, s[2:3], m0 offset:0x20 scale_offset ; encoding: [0x01,0x01,0x00,0xf4,0x20,0x00,0x00,0xfb]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}s_load_b32 s4, s[2:3], m0 offset:32 scale_offset
+// GFX12-ERR-NEXT:{{^}}                                    ^
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_smem_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_smem_err.s
new file mode 100644
index 0000000000000..e57d4fc760309
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_smem_err.s
@@ -0,0 +1,16 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX1250-ERR --implicit-check-not=error: --strict-whitespace %s
+
+s_buffer_load_i8 s5, s[4:7], s0 scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}s_buffer_load_i8 s5, s[4:7], s0 scale_offset
+// GFX1250-ERR-NEXT:{{^}}                                ^
+
+s_prefetch_data s[18:19], 100, s10, 7 nv
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX1250-ERR-NEXT:{{^}}s_prefetch_data s[18:19], 100, s10, 7 nv
+// GFX1250-ERR-NEXT:{{^}}                                      ^
+
+s_prefetch_data s[18:19], 100, s10, 7 scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX1250-ERR-NEXT:{{^}}s_prefetch_data s[18:19], 100, s10, 7 scale_offset
+// GFX1250-ERR-NEXT:{{^}}                                      ^
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vbuffer_mubuf_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vbuffer_mubuf_err.s
new file mode 100644
index 0000000000000..731eb6770ceed
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vbuffer_mubuf_err.s
@@ -0,0 +1,6 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX1250-ERR --implicit-check-not=error: --strict-whitespace %s
+
+buffer_load_b32 v5, v1, s[8:11], s3 offen offset:4095 scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}buffer_load_b32 v5, v1, s[8:11], s3 offen offset:4095 scale_offset
+// GFX1250-ERR-NEXT:{{^}}                                                      ^
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s
index 488040e1b5390..d3a49f2eb25fa 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s
@@ -61,6 +61,54 @@ scratch_load_b32 v5, v2, off nv
 // GFX12-ERR-NEXT:{{^}}scratch_load_b32 v5, v2, off nv
 // GFX12-ERR-NEXT:{{^}}                             ^
 
+global_load_b32 v5, v1, s[2:3] offset:32 scale_offset
+// GFX1250: global_load_b32 v5, v1, s[2:3] offset:32 scale_offset ; encoding: [0x02,0x00,0x05,0xee,0x05,0x00,0x01,0x00,0x01,0x20,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}global_load_b32 v5, v1, s[2:3] offset:32 scale_offset
+// GFX12-ERR-NEXT:{{^}}                                         ^
+
+global_store_b32 v5, v1, s[2:3] offset:32 scale_offset
+// GFX1250: global_store_b32 v5, v1, s[2:3] offset:32 scale_offset ; encoding: [0x02,0x80,0x06,0xee,0x00,0x00,0x81,0x00,0x05,0x20,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}global_store_b32 v5, v1, s[2:3] offset:32 scale_offset
+// GFX12-ERR-NEXT:{{^}}                                          ^
+
+global_atomic_add_u32 v2, v5, s[2:3] scale_offset
+// GFX1250: global_atomic_add_u32 v2, v5, s[2:3] scale_offset ; encoding: [0x02,0x40,0x0d,0xee,0x00,0x00,0x81,0x02,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}global_atomic_add_u32 v2, v5, s[2:3] scale_offset
+// GFX12-ERR-NEXT:{{^}}                                     ^
+
+scratch_load_b32 v5, v2, off scale_offset
+// GFX1250: scratch_load_b32 v5, v2, off scale_offset ; encoding: [0x7c,0x00,0x05,0xed,0x05,0x00,0x03,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}scratch_load_b32 v5, v2, off scale_offset
+// GFX12-ERR-NEXT:{{^}}                             ^
+
+scratch_load_b32 v5, v2, off offset:32 scale_offset
+// GFX1250: scratch_load_b32 v5, v2, off offset:32 scale_offset ; encoding: [0x7c,0x00,0x05,0xed,0x05,0x00,0x03,0x00,0x02,0x20,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}scratch_load_b32 v5, v2, off offset:32 scale_offset
+// GFX12-ERR-NEXT:{{^}}                                       ^
+
+scratch_load_b32 v5, v2, s1 offset:32 scale_offset
+// GFX1250: scratch_load_b32 v5, v2, s1 offset:32 scale_offset ; encoding: [0x01,0x00,0x05,0xed,0x05,0x00,0x03,0x00,0x02,0x20,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}scratch_load_b32 v5, v2, s1 offset:32 scale_offset
+// GFX12-ERR-NEXT:{{^}}                                      ^
+
+scratch_store_b32 v2, v5, off scale_offset
+// GFX1250: scratch_store_b32 v2, v5, off scale_offset ; encoding: [0x7c,0x80,0x06,0xed,0x00,0x00,0x83,0x02,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}scratch_store_b32 v2, v5, off scale_offset
+// GFX12-ERR-NEXT:{{^}}                              ^
+
+scratch_store_b32 v2, v5, s1 scale_offset
+// GFX1250: scratch_store_b32 v2, v5, s1 scale_offset ; encoding: [0x01,0x80,0x06,0xed,0x00,0x00,0x83,0x02,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: scale_offset is not supported on this GPU
+// GFX12-ERR-NEXT:{{^}}scratch_store_b32 v2, v5, s1 scale_offset
+// GFX12-ERR-NEXT:{{^}}                             ^
+
 tensor_save s[0:1]
 // GFX1250: tensor_save s[0:1] ; encoding: [0x00,0x80,0x1b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
@@ -81,10 +129,18 @@ tensor_stop th:TH_STORE_BYPASS scope:SCOPE_SYS
 // GFX1250: tensor_stop th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x1b,0xee,0x00,0x00,0x3c,0x00,0x00,0x00,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
+flat_atomic_add_f32 v1, v2, s[2:3] offset:8000000 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_add_f32 v1, v2, s[2:3] offset:8000000 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x15,0xec,0x00,0x00,0x11,0x01,0x01,0x00,0x12,0x7a]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
 flat_atomic_add_f32 v2, v3, s[2:3] offset:64
 // GFX1250: flat_atomic_add_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x15,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
+flat_atomic_add_u32 v1, v2, s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_add_u32 v1, v2, s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x0d,0xec,0x00,0x00,0x11,0x01,0x01,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
 flat_atomic_add_u32 v2, v3, s[2:3] offset:-64
 // GFX1250: flat_atomic_add_u32 v2, v3, s[2:3] offset:-64 ; encoding: [0x02,0x40,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0xc0,0xff,0xff]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -93,6 +149,14 @@ flat_atomic_add_u64 v2, v[2:3], s[2:3] offset:64
 // GFX1250: flat_atomic_add_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
+flat_atomic_add_u64 v[0:1], v2, v[2:3], s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_add_u64 v[0:1], v2, v[2:3], s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x10,0xec,0x00,0x00,0x11,0x01,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_and_b32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_and_b32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
 flat_atomic_and_b32 v2, v3, s[2:3] offset:64
 // GFX1250: flat_atomic_and_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -101,18 +165,38 @@ flat_atomic_and_b64 v2, v[2:3], s[2:3] offset:64
 // GFX1250: flat_atomic_and_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
+flat_atomic_and_b64 v[0:1], v2, v[2:3], s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_and_b64 v[0:1], v2, v[2:3], s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_cmpswap_b32 v0, v2, v[2:3], s[2:3] scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_cmpswap_b32 v0, v2, v[2:3], s[2:3] scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x0d,0xec,0x00,0x00,0x11,0x01,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
 flat_atomic_cmpswap_b32 v2, v[2:3], s[2:3] offset:64
 // GFX1250: flat_atomic_cmpswap_b32 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x0d,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
+flat_atomic_cmpswap_b64 v[0:1], v2, v[2:5], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_cmpswap_b64 v[0:1], v2, v[2:5], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x10,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
 flat_atomic_cmpswap_b64 v2, v[2:5], s[2:3]
 // GFX1250: flat_atomic_cmpswap_b64 v2, v[2:5], s[2:3] ; encoding: [0x02,0x80,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
+flat_atomic_cond_sub_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_cond_sub_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x14,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
 flat_atomic_cond_sub_u32 v2, v3, s[2:3] offset:64
 // GFX1250: flat_atomic_cond_sub_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
+flat_atomic_dec_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_dec_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x10,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
 flat_atomic_dec_u32 v2, v3, s[2:3] offset:64
 // GFX1250: flat_atomic_dec_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x10,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -121,6 +205,14 @@ flat_atomic_dec_u64 v2, v[2:3], s[2:3] offset:64
 // GFX1250: flat_atomic_dec_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
+flat_atomic_dec_u64 v[0:1], v2, v[2:3], s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_dec_u64 v[0:1], v2, v[2:3], s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x13,0xec,0x00,0x00,0x11,0x01,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_inc_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_inc_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
 flat_atomic_inc_u32 v2, v3, s[2:3] offset:64
 // GFX1250: flat_atomic_inc_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -129,10 +221,22 @@ flat_atomic_inc_u64 v2, v[2:3], s[2:3] offset:64
 // GFX1250: flat_atomic_inc_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
+flat_atomic_inc_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_inc_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x13,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_max_num_f32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_max_num_f32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x14,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
 flat_atomic_max_num_f32 v2, v3, s[2:3] offset:64
 // GFX1250: flat_atomic_max_num_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
+flat_atomic_max_i32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_max_i32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
 flat_atomic_max_i32 v2, v3, s[2:3] offset:64
 // GFX1250: flat_atomic_max_i32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -141,6 +245,14 @@ flat_atomic_max_i64 v2, v[2:3], s[2:3] offset:64
 // GFX1250: flat_atomic_max_i64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
+flat_atomic_max_i64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_max_i64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_max_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_max_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
 flat_atomic_max_u32 v2, v3, s[2:3] offset:64
 // GFX1250: flat_atomic_max_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -149,10 +261,22 @@ flat_atomic_max_u64 v2, v[2:3], s[2:3] offset:64
 // GFX1250: flat_atomic_max_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
+flat_atomic_max_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_max_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_min_num_f32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_min_num_f32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x14,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
 flat_atomic_min_num_f32 v2, v3, s[2:3] offset:64
 // GFX1250: flat_atomic_min_num_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
+flat_atomic_min_i32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_min_i32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
 flat_atomic_min_i32 v2, v3, s[2:3] offset:64
 // GFX1250: flat_atomic_min_i32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -161,6 +285,14 @@ flat_atomic_min_i64 v2, v[2:3], s[2:3] offset:64
 // GFX1250: flat_atomic_min_i64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
+flat_atomic_min_i64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_min_i64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_min_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_min_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
 flat_atomic_min_u32 v2, v3, s[2:3] offset:64
 // GFX1250: flat_atomic_min_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -169,6 +301,14 @@ flat_atomic_min_u64 v2, v[2:3], s[2:3] offset:64
 // GFX1250: flat_atomic_min_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x80,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
+flat_atomic_min_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_min_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_or_b32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_or_b32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
 flat_atomic_or_b32 v2, v3, s[2:3] offset:64
 // GFX1250: flat_atomic_or_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -177,10 +317,22 @@ flat_atomic_or_b64 v2, v[2:3], s[2:3] offset:64
 // GFX1250: flat_atomic_or_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x80,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
+flat_atomic_or_b64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_or_b64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_sub_clamp_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_sub_clamp_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x0d,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
 flat_atomic_sub_clamp_u32 v2, v3, s[2:3] offset:64
 // GFX1250: flat_atomic_sub_clamp_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
+flat_atomic_sub_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_sub_u32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x0d,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
 flat_atomic_sub_u32 v2, v3, s[2:3] offset:64
 // GFX1250: flat_atomic_sub_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -189,6 +341,14 @@ flat_atomic_sub_u64 v2, v[2:3], s[2:3] offset:64
 // GFX1250: flat_atomic_sub_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
+flat_atomic_sub_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_sub_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_swap_b32 v0, v2, s[2:3] scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_swap_b32 v0, v2, s[2:3] scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x0c,0xec,0x00,0x00,0x11,0x01,0x00,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
 flat_atomic_swap_b32 v2, v3, s[2:3] offset:64
 // GFX1250: flat_atomic_swap_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0c,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -197,6 +357,14 @@ flat_atomic_swap_b64 v2, v[2:3], s[2:3] offset:64
 // GFX1250: flat_atomic_swap_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
+flat_atomic_swap_b64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_swap_b64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x10,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_xor_b32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_xor_b32 v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
 flat_atomic_xor_b32 v2, v3, s[2:3] offset:64
 // GFX1250: flat_atomic_xor_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
@@ -205,10 +373,114 @@ flat_atomic_xor_b64 v2, v[2:3], s[2:3] offset:64
 // GFX1250: flat_atomic_xor_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
+flat_atomic_xor_b64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_xor_b64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_atomic_pk_add_f16 v1, v2, s[2:3] offset:8000000 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_pk_add_f16 v1, v2, s[2:3] offset:8000000 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x16,0xec,0x00,0x00,0x11,0x01,0x01,0x00,0x12,0x7a]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
 flat_atomic_pk_add_f16 v2, v3, s[2:3] offset:64
 // GFX1250: flat_atomic_pk_add_f16 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 
+flat_atomic_pk_add_bf16 v1, v2, s[2:3] offset:8000000 scale_offset th:TH_ATOMIC_RETURN
+// GFX1250: flat_atomic_pk_add_bf16 v1, v2, s[2:3] offset:8000000 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x16,0xec,0x00,0x00,0x11,0x01,0x01,0x00,0x12,0x7a]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
 flat_atomic_pk_add_bf16 v2, v3, s[2:3] offset:64
 // GFX1250: flat_atomic_pk_add_bf16 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+flat_load_b128 v[2:5], v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_b128 v[2:5], v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0xc0,0x05,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_b32 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_b32 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x05,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_b64 v[2:3], v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_b64 v[2:3], v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x05,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_b96 v[2:4], v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_b96 v[2:4], v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x80,0x05,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_d16_b16 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_d16_b16 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_d16_hi_b16 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_d16_hi_b16 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0xc0,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_d16_hi_i8 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_d16_hi_i8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x80,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_d16_hi_u8 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_d16_hi_u8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_d16_i8 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_d16_i8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0xc0,0x07,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_d16_u8 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_d16_u8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x80,0x07,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_i16 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_i16 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0xc0,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_i8 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_i8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_u16 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_u16 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x80,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_u8 v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_u8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_load_dword v1, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_load_b32 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x05,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_store_b128 v2, v[2:5], s[2:3] offset:64 scale_offset
+// GFX1250: flat_store_b128 v2, v[2:5], s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x07,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_store_b16 v2, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_store_b16 v2, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_store_b32 v2, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_store_b32 v2, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x80,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_store_b64 v2, v[2:3], s[2:3] offset:64 scale_offset
+// GFX1250: flat_store_b64 v2, v[2:3], s[2:3] offset:64 scale_offset ; encoding: [0x02,0xc0,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_store_b8 v2, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_store_b8 v2, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_store_b96 v2, v[2:4], s[2:3] offset:64 scale_offset
+// GFX1250: flat_store_b96 v2, v[2:4], s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x07,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_store_d16_hi_b16 v2, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_store_d16_hi_b16 v2, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x09,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
+
+flat_store_d16_hi_b8 v2, v2, s[2:3] offset:64 scale_offset
+// GFX1250: flat_store_d16_hi_b8 v2, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x09,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: not a valid operand.
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s
index 86a63bcb4be89..26d7ed3cfae4c 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s
@@ -2,3 +2,58 @@
 
 global_load_b96 v[1:3], v[0:1], off
 // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid register class: vgpr tuples must be 64 bit aligned
+
+flat_load_b32 v5, v[2:3] scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}flat_load_b32 v5, v[2:3] scale_offset
+// GFX1250-ERR-NEXT:{{^}}                         ^
+
+flat_load_b32 v5, v[2:3] offset:32 scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}flat_load_b32 v5, v[2:3] offset:32 scale_offset
+// GFX1250-ERR-NEXT:{{^}}                                   ^
+
+flat_store_b32 v[2:3], v5 scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}flat_store_b32 v[2:3], v5 scale_offset
+// GFX1250-ERR-NEXT:{{^}}                          ^
+
+flat_atomic_add v[2:3], v2 scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}flat_atomic_add v[2:3], v2 scale_offset
+// GFX1250-ERR-NEXT:{{^}}                           ^
+
+global_load_b32 v5, v[2:3], off offset:32 scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}global_load_b32 v5, v[2:3], off offset:32 scale_offset
+// GFX1250-ERR-NEXT:{{^}}                                          ^
+
+global_store_b32 v[2:3], v5, off offset:32 scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}global_store_b32 v[2:3], v5, off offset:32 scale_offset
+// GFX1250-ERR-NEXT:{{^}}                                           ^
+
+global_atomic_add v[2:3], v2, off scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}global_atomic_add v[2:3], v2, off scale_offset
+// GFX1250-ERR-NEXT:{{^}}                                  ^
+
+global_load_addtid_b32 v5, s[2:3] scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}global_load_addtid_b32 v5, s[2:3] scale_offset
+// GFX1250-ERR-NEXT:{{^}}                                  ^
+
+global_store_addtid_b32 v5, s[2:3] scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}global_store_addtid_b32 v5, s[2:3] scale_offset
+// GFX1250-ERR-NEXT:{{^}}                                   ^
+
+scratch_load_b32 v5, off, s1 scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}scratch_load_b32 v5, off, s1 scale_offset
+// GFX1250-ERR-NEXT:{{^}}                             ^
+
+scratch_load_b32 v5, off, off offset:32 scale_offset
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
+// GFX1250-ERR-NEXT:{{^}}scratch_load_b32 v5, off, off offset:32 scale_offset
+// GFX1250-ERR-NEXT:{{^}}                                        ^
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_smem.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_smem.txt
index 4bd9ab45af6bd..92fa802802f29 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_smem.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_smem.txt
@@ -5,3 +5,15 @@
 
 # GFX1250: s_load_b32 s4, s[2:3], 0xa nv           ; encoding: [0x01,0x01,0x10,0xf4,0x0a,0x00,0x00,0xf8]
 0x01,0x01,0x10,0xf4,0x0a,0x00,0x00,0xf8
+
+# GFX1250: s_load_b32 s4, s[2:3], 0xa scale_offset ; encoding: [0x01,0x01,0x00,0xf4,0x0a,0x00,0x00,0xf9]
+0x01,0x01,0x00,0xf4,0x0a,0x00,0x00,0xf9
+
+# GFX1250: s_load_b32 s4, s[2:3], 0xa scale_offset nv ; encoding: [0x01,0x01,0x10,0xf4,0x0a,0x00,0x00,0xf9]
+0x01,0x01,0x10,0xf4,0x0a,0x00,0x00,0xf9
+
+# GFX1250: s_load_b32 s4, s[2:3], m0 offset:0x20 scale_offset ; encoding: [0x01,0x01,0x00,0xf4,0x20,0x00,0x00,0xfb]
+0x01,0x01,0x00,0xf4,0x20,0x00,0x00,0xfb
+
+# GFX1250: s_load_b32 s4, s[2:3], s5 offset:0x20 scale_offset ; encoding: [0x01,0x01,0x00,0xf4,0x20,0x00,0x00,0x0b]
+0x01,0x01,0x00,0xf4,0x20,0x00,0x00,0x0b
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt
index fcbb58bf7865e..3455f4c3b46e9 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt
@@ -1,104 +1,269 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s
 
+# GFX1250: flat_atomic_add_f32 v0, v1, v2, s[2:3] offset:8000000 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x15,0xec,0x00,0x00,0x11,0x01,0x01,0x00,0x12,0x7a]
+0x02,0x80,0x15,0xec,0x00,0x00,0x11,0x01,0x01,0x00,0x12,0x7a
+
 # GFX1250: flat_atomic_add_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x15,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 0x02,0x80,0x15,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
 
+# GFX1250: flat_atomic_add_u32 v0, v1, v2, s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x0d,0xec,0x00,0x00,0x11,0x01,0x01,0xc0,0xff,0xff]
+0x02,0x40,0x0d,0xec,0x00,0x00,0x11,0x01,0x01,0xc0,0xff,0xff
+
 # GFX1250: flat_atomic_add_u32 v2, v3, s[2:3] offset:-64 ; encoding: [0x02,0x40,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0xc0,0xff,0xff]
 0x02,0x40,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0xc0,0xff,0xff
 
 # GFX1250: flat_atomic_add_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 0x02,0xc0,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
 
+# GFX1250: flat_atomic_add_u64 v[0:1], v2, v[2:3], s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x10,0xec,0x00,0x00,0x11,0x01,0x02,0xc0,0xff,0xff]
+0x02,0xc0,0x10,0xec,0x00,0x00,0x11,0x01,0x02,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_and_b32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x00,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
 # GFX1250: flat_atomic_and_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 0x02,0x00,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
 
 # GFX1250: flat_atomic_and_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 0x02,0x40,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
 
+# GFX1250: flat_atomic_and_b64 v[0:1], v2, v[2:3], s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0xc0,0xff,0xff]
+0x02,0x40,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_cmpswap_b32 v0, v2, v[2:3], s[2:3] scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x0d,0xec,0x00,0x00,0x11,0x01,0x02,0x00,0x00,0x00]
+0x02,0x00,0x0d,0xec,0x00,0x00,0x11,0x01,0x02,0x00,0x00,0x00
+
 # GFX1250: flat_atomic_cmpswap_b32 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x0d,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 0x02,0x00,0x0d,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
 
 # GFX1250: flat_atomic_cmpswap_b64 v2, v[2:5], s[2:3] ; encoding: [0x02,0x80,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00]
 0x02,0x80,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00
 
+# GFX1250: flat_atomic_cmpswap_b64 v[0:1], v2, v[2:5], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x10,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+0x02,0x80,0x10,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_cond_sub_u32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x14,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x00,0x14,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
 # GFX1250: flat_atomic_cond_sub_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 0x02,0x00,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
 
+# GFX1250: flat_atomic_dec_u32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x10,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x00,0x10,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
 # GFX1250: flat_atomic_dec_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x10,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 0x02,0x00,0x10,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
 
 # GFX1250: flat_atomic_dec_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 0x02,0x40,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
 
+# GFX1250: flat_atomic_dec_u64 v[0:1], v2, v[2:3], s[2:3] offset:-64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x13,0xec,0x00,0x00,0x11,0x01,0x02,0xc0,0xff,0xff]
+0x02,0x40,0x13,0xec,0x00,0x00,0x11,0x01,0x02,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_inc_u32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0xc0,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
 # GFX1250: flat_atomic_inc_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 0x02,0xc0,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
 
 # GFX1250: flat_atomic_inc_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 0x02,0x00,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
 
+# GFX1250: flat_atomic_inc_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x13,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+0x02,0x00,0x13,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_max_i32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x80,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
 # GFX1250: flat_atomic_max_i32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 0x02,0x80,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
 
 # GFX1250: flat_atomic_max_i64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 0x02,0xc0,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
 
+# GFX1250: flat_atomic_max_i64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+0x02,0xc0,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_max_num_f32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x14,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x80,0x14,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
 # GFX1250: flat_atomic_max_num_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 0x02,0x80,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
 
+# GFX1250: flat_atomic_max_u32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0xc0,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
 # GFX1250: flat_atomic_max_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 0x02,0xc0,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
 
 # GFX1250: flat_atomic_max_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 0x02,0x00,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
 
+# GFX1250: flat_atomic_max_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+0x02,0x00,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_min_i32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x00,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
 # GFX1250: flat_atomic_min_i32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 0x02,0x00,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
 
 # GFX1250: flat_atomic_min_i64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 0x02,0x40,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
 
+# GFX1250: flat_atomic_min_i64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+0x02,0x40,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_min_num_f32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x14,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x40,0x14,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
 # GFX1250: flat_atomic_min_num_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 0x02,0x40,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
 
+# GFX1250: flat_atomic_min_u32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x40,0x0e,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
 # GFX1250: flat_atomic_min_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 0x02,0x40,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
 
 # GFX1250: flat_atomic_min_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x80,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 0x02,0x80,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
 
+# GFX1250: flat_atomic_min_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+0x02,0x80,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_or_b32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x40,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
 # GFX1250: flat_atomic_or_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 0x02,0x40,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
 
 # GFX1250: flat_atomic_or_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x80,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 0x02,0x80,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
 
+# GFX1250: flat_atomic_or_b64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+0x02,0x80,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_pk_add_bf16 v0, v1, v2, s[2:3] offset:8000000 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x16,0xec,0x00,0x00,0x11,0x01,0x01,0x00,0x12,0x7a]
+0x02,0x80,0x16,0xec,0x00,0x00,0x11,0x01,0x01,0x00,0x12,0x7a
+
 # GFX1250: flat_atomic_pk_add_bf16 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 0x02,0x80,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
 
+# GFX1250: flat_atomic_pk_add_f16 v0, v1, v2, s[2:3] offset:8000000 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x16,0xec,0x00,0x00,0x11,0x01,0x01,0x00,0x12,0x7a]
+0x02,0x40,0x16,0xec,0x00,0x00,0x11,0x01,0x01,0x00,0x12,0x7a
+
 # GFX1250: flat_atomic_pk_add_f16 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 0x02,0x40,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
 
+# GFX1250: flat_atomic_sub_clamp_u32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x0d,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0xc0,0x0d,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
 # GFX1250: flat_atomic_sub_clamp_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 0x02,0xc0,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
 
+# GFX1250: flat_atomic_sub_u32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x0d,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x80,0x0d,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
 # GFX1250: flat_atomic_sub_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 0x02,0x80,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
 
 # GFX1250: flat_atomic_sub_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 0x02,0x00,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
 
+# GFX1250: flat_atomic_sub_u64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+0x02,0x00,0x11,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_swap_b32 v0, v0, v2, s[2:3] scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x0c,0xec,0x00,0x00,0x11,0x01,0x00,0x00,0x00,0x00]
+0x02,0xc0,0x0c,0xec,0x00,0x00,0x11,0x01,0x00,0x00,0x00,0x00
+
 # GFX1250: flat_atomic_swap_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0c,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 0x02,0xc0,0x0c,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
 
 # GFX1250: flat_atomic_swap_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 0x02,0x40,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
 
+# GFX1250: flat_atomic_swap_b64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x10,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+0x02,0x40,0x10,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_xor_b32 v0, v1, v2, s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00]
+0x02,0x80,0x0f,0xec,0x00,0x00,0x11,0x01,0x01,0x40,0x00,0x00
+
 # GFX1250: flat_atomic_xor_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00]
 0x02,0x80,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00
 
 # GFX1250: flat_atomic_xor_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00]
 0x02,0xc0,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00
 
+# GFX1250: flat_atomic_xor_b64 v[0:1], v2, v[2:3], s[2:3] offset:64 scale_offset th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00]
+0x02,0xc0,0x12,0xec,0x00,0x00,0x11,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_b128 v[2:5], v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0xc0,0x05,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0xc0,0x05,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_b32 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x05,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0x00,0x05,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_b64 v[2:3], v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x05,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0x40,0x05,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_b96 v[2:4], v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x80,0x05,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0x80,0x05,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_d16_b16 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0x00,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_d16_hi_b16 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0xc0,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0xc0,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_d16_hi_i8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x80,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0x80,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_d16_hi_u8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0x40,0x08,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_d16_i8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0xc0,0x07,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0xc0,0x07,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_d16_u8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x80,0x07,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0x80,0x07,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_i16 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0xc0,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0xc0,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_i8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0x40,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_u16 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x80,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0x80,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_load_u8 v1, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+0x02,0x00,0x04,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_store_b128 v2, v[2:5], s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x07,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+0x02,0x40,0x07,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_store_b16 v2, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+0x02,0x40,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_store_b32 v2, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x80,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+0x02,0x80,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_store_b64 v2, v[2:3], s[2:3] offset:64 scale_offset ; encoding: [0x02,0xc0,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+0x02,0xc0,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_store_b8 v2, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+0x02,0x00,0x06,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_store_b96 v2, v[2:4], s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x07,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+0x02,0x00,0x07,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_store_d16_hi_b16 v2, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x40,0x09,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+0x02,0x40,0x09,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00
+
+# GFX1250: flat_store_d16_hi_b8 v2, v2, s[2:3] offset:64 scale_offset ; encoding: [0x02,0x00,0x09,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00]
+0x02,0x00,0x09,0xec,0x00,0x00,0x01,0x01,0x02,0x40,0x00,0x00
+
 # GFX1250: flat_atomic_add_f32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x15,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
 0x7c,0x80,0x15,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
 
@@ -2856,6 +3021,30 @@
 # GFX1250: scratch_load_b32 v5, v2, off nv         ; encoding: [0xfc,0x00,0x05,0xed,0x05,0x00,0x02,0x00,0x02,0x00,0x00,0x00]
 0xfc,0x00,0x05,0xed,0x05,0x00,0x02,0x00,0x02,0x00,0x00,0x00
 
+# GFX1250: global_load_b32 v5, v1, s[2:3] offset:32 scale_offset ; encoding: [0x02,0x00,0x05,0xee,0x05,0x00,0x01,0x00,0x01,0x20,0x00,0x00]
+0x02,0x00,0x05,0xee,0x05,0x00,0x01,0x00,0x01,0x20,0x00,0x00
+
+# GFX1250: global_store_b32 v5, v1, s[2:3] offset:32 scale_offset ; encoding: [0x02,0x80,0x06,0xee,0x00,0x00,0x81,0x00,0x05,0x20,0x00,0x00]
+0x02,0x80,0x06,0xee,0x00,0x00,0x81,0x00,0x05,0x20,0x00,0x00
+
+# GFX1250: global_atomic_add_u32 v2, v5, s[2:3] scale_offset ; encoding: [0x02,0x40,0x0d,0xee,0x00,0x00,0x81,0x02,0x02,0x00,0x00,0x00]
+0x02,0x40,0x0d,0xee,0x00,0x00,0x81,0x02,0x02,0x00,0x00,0x00
+
+# GFX1250: scratch_load_b32 v5, v2, off scale_offset ; encoding: [0x7c,0x00,0x05,0xed,0x05,0x00,0x03,0x00,0x02,0x00,0x00,0x00]
+0x7c,0x00,0x05,0xed,0x05,0x00,0x03,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: scratch_load_b32 v5, v2, off offset:32 scale_offset ; encoding: [0x7c,0x00,0x05,0xed,0x05,0x00,0x03,0x00,0x02,0x20,0x00,0x00]
+0x7c,0x00,0x05,0xed,0x05,0x00,0x03,0x00,0x02,0x20,0x00,0x00
+
+# GFX1250: scratch_load_b32 v5, v2, s1 offset:32 scale_offset ; encoding: [0x01,0x00,0x05,0xed,0x05,0x00,0x03,0x00,0x02,0x20,0x00,0x00]
+0x01,0x00,0x05,0xed,0x05,0x00,0x03,0x00,0x02,0x20,0x00,0x00
+
+# GFX1250: scratch_store_b32 v2, v5, off scale_offset ; encoding: [0x7c,0x80,0x06,0xed,0x00,0x00,0x83,0x02,0x02,0x00,0x00,0x00]
+0x7c,0x80,0x06,0xed,0x00,0x00,0x83,0x02,0x02,0x00,0x00,0x00
+
+# GFX1250: scratch_store_b32 v2, v5, s1 scale_offset ; encoding: [0x01,0x80,0x06,0xed,0x00,0x00,0x83,0x02,0x02,0x00,0x00,0x00]
+0x01,0x80,0x06,0xed,0x00,0x00,0x83,0x02,0x02,0x00,0x00,0x00
+
 # GFX1250: tensor_save s[0:1] ; encoding: [0x00,0x80,0x1b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
 0x00,0x80,0x1b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00