[AMDGPU] Extend zero initialization of return values for TFE #85759

Merged: 7 commits into llvm:main on Mar 25, 2024

Conversation

@dstutt (Collaborator) commented Mar 19, 2024

buffer_load instructions that use TFE also need to zero-initialize their return values, similar to how the image instructions currently work. Add support for this with the standard zero-initialization of all results, plus zero-initialization of just the TFE status result when the enable-prt-strict-null subtarget feature is disabled.
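
For illustration, below is a minimal LLVM IR sketch of the kind of input this affects: a struct buffer load whose aggregate result carries the extra TFE status dword. The function name, the way the status is consumed, and the exact intrinsic mangling are illustrative assumptions, not excerpts from this patch's tests.

; Hypothetical example (assumed intrinsic mangling): a struct buffer load that
; requests the TFE status dword. The data and status registers must be
; zero-initialized before the MUBUF instruction executes, because the hardware
; may not write them when the access fails.
define amdgpu_ps float @buffer_load_with_tfe(<4 x i32> inreg %rsrc, i32 %idx) {
main_body:
  ; Element 0 of the aggregate is the loaded data, element 1 is the TFE status.
  %res = call { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4f32i32s(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
  %data = extractvalue { <4 x float>, i32 } %res, 0
  %status = extractvalue { <4 x float>, i32 } %res, 1
  ; A non-zero status means the access failed; fall back to 0.0 in that case.
  %failed = icmp ne i32 %status, 0
  %x = extractelement <4 x float> %data, i32 0
  %out = select i1 %failed, float 0.000000e+00, float %x
  ret float %out
}

declare { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4f32i32s(<4 x i32>, i32, i32, i32, i32)

Without the zero-initialization this patch adds, those destination registers could hold stale values when the access fails and the hardware skips the write.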

@llvmbot (Collaborator) commented Mar 19, 2024

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-amdgpu

Author: David Stuttard (dstutt)

Changes

buffer_load instructions that use TFE also need to zero-initialize their return values, similar to how the image instructions currently work. Add support for this with the standard zero-initialization of all results, plus zero-initialization of just the TFE status result when the enable-prt-strict-null subtarget feature is disabled.


Patch is 78.43 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/85759.diff

7 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/BUFInstructions.td (+7-1)
  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+44-37)
  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+1-1)
  • (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+6)
  • (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+3)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll (+444-69)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll (+280)
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 4ae514ffcf7850..3078e56b54b834 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -119,6 +119,7 @@ class MTBUF_Real <MTBUF_Pseudo ps, string real_name = ps.Mnemonic> :
 
   let isPseudo = 0;
   let isCodeGenOnly = 0;
+  let hasPostISelHook = 1;
 
   let VM_CNT = 1;
   let EXP_CNT = 1;
@@ -323,6 +324,7 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
   Instruction BaseOpcode = !cast<Instruction>(MUBUFGetBaseOpcode<NAME>.ret);
   let MUBUF = 1;
   let AsmMatchConverter = "cvtMubuf";
+  let hasPostISelHook = 1;
 }
 
 class MUBUF_Real <MUBUF_Pseudo ps, string real_name = ps.Mnemonic> :
@@ -330,6 +332,7 @@ class MUBUF_Real <MUBUF_Pseudo ps, string real_name = ps.Mnemonic> :
 
   let isPseudo = 0;
   let isCodeGenOnly = 0;
+  let hasPostISelHook = 1;
 
   let VM_CNT = 1;
   let EXP_CNT = 1;
@@ -390,6 +393,7 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> :
   let has_slc     = 0;
   let has_sccb    = 0;
   let sccb_value  = 0;
+  let tfe         = 0;
 }
 
 class getLdStVDataRegisterOperand<RegisterClass RC, bit isTFE> {
@@ -675,6 +679,7 @@ class MUBUF_Pseudo_Store_Lds<string opName>
   let has_vaddr = 0;
   let lds = 1;
   let VALU = 1;
+  let tfe = 0;
 
   let Uses = [EXEC, M0];
   let AsmMatchConverter = "cvtMubuf";
@@ -733,6 +738,7 @@ class MUBUF_Atomic_Pseudo<string opName,
   let has_glc = 0;
   let has_dlc = 0;
   let has_sccb = 1;
+  let tfe = 0;
   let AsmMatchConverter = "cvtMubufAtomic";
 }
 
@@ -3369,7 +3375,7 @@ def MUBUFInfoTable : GenericTable {
   let CppTypeName = "MUBUFInfo";
   let Fields = [
     "Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset",
-    "IsBufferInv"
+    "IsBufferInv", "tfe"
   ];
 
   let PrimaryKey = ["Opcode"];
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5ccf21f76015de..c1cfa39368f479 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15033,57 +15033,63 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
 // result register that will be written in the case of a memory access failure.
 // The required code is also added to tie this init code to the result of the
 // img instruction.
-void SITargetLowering::AddIMGInit(MachineInstr &MI) const {
+void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
   MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
   MachineBasicBlock &MBB = *MI.getParent();
 
-  MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
-  MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
-  MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
+  int DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
+  unsigned InitIdx = 0;
 
-  if (!TFE && !LWE) // intersect_ray
-    return;
-
-  unsigned TFEVal = TFE ? TFE->getImm() : 0;
-  unsigned LWEVal = LWE ? LWE->getImm() : 0;
-  unsigned D16Val = D16 ? D16->getImm() : 0;
+  if (TII->isImage(MI)) {
+    MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
+    MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
+    MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
 
-  if (!TFEVal && !LWEVal)
-    return;
+    if (!TFE && !LWE) // intersect_ray
+      return;
 
-  // At least one of TFE or LWE are non-zero
-  // We have to insert a suitable initialization of the result value and
-  // tie this to the dest of the image instruction.
+    unsigned TFEVal = TFE ? TFE->getImm() : 0;
+    unsigned LWEVal = LWE ? LWE->getImm() : 0;
+    unsigned D16Val = D16 ? D16->getImm() : 0;
 
-  const DebugLoc &DL = MI.getDebugLoc();
+    if (!TFEVal && !LWEVal)
+      return;
 
-  int DstIdx =
-      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
+    // At least one of TFE or LWE are non-zero
+    // We have to insert a suitable initialization of the result value and
+    // tie this to the dest of the image instruction.
 
-  // Calculate which dword we have to initialize to 0.
-  MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
+    // Calculate which dword we have to initialize to 0.
+    MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
 
-  // check that dmask operand is found.
-  assert(MO_Dmask && "Expected dmask operand in instruction");
+    // check that dmask operand is found.
+    assert(MO_Dmask && "Expected dmask operand in instruction");
 
-  unsigned dmask = MO_Dmask->getImm();
-  // Determine the number of active lanes taking into account the
-  // Gather4 special case
-  unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
+    unsigned dmask = MO_Dmask->getImm();
+    // Determine the number of active lanes taking into account the
+    // Gather4 special case
+    unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
 
-  bool Packed = !Subtarget->hasUnpackedD16VMem();
+    bool Packed = !Subtarget->hasUnpackedD16VMem();
 
-  unsigned InitIdx =
-      D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
+    InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
 
-  // Abandon attempt if the dst size isn't large enough
-  // - this is in fact an error but this is picked up elsewhere and
-  // reported correctly.
-  uint32_t DstSize = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
-  if (DstSize < InitIdx)
+    // Abandon attempt if the dst size isn't large enough
+    // - this is in fact an error but this is picked up elsewhere and
+    // reported correctly.
+    uint32_t DstSize = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
+    if (DstSize < InitIdx)
+      return;
+  } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFHasTFE(MI.getOpcode())) {
+    uint32_t DstSize = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
+    InitIdx = DstSize;
+  } else {
     return;
+  }
+
+  const DebugLoc &DL = MI.getDebugLoc();
 
   // Create a register for the initialization value.
   Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
@@ -15184,11 +15190,12 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
     return;
   }
 
-  if (TII->isImage(MI)) {
+  if (TII->isImage(MI) || TII->isMUBUF(MI)) {
     if (!MI.mayStore())
-      AddIMGInit(MI);
-    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
+      AddMemOpInit(MI);
   }
+  if (TII->isImage(MI))
+    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
 }
 
 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 89da4428e3ab0a..9856a2923d38f7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -466,7 +466,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 
   SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
   SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
-  void AddIMGInit(MachineInstr &MI) const;
+  void AddMemOpInit(MachineInstr &MI) const;
   void AdjustInstrPostInstrSelection(MachineInstr &MI,
                                      SDNode *Node) const override;
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 6d53f68ace70df..5e89b3fdaf032d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -318,6 +318,7 @@ struct MUBUFInfo {
   bool has_srsrc;
   bool has_soffset;
   bool IsBufferInv;
+  bool tfe;
 };
 
 struct MTBUFInfo {
@@ -466,6 +467,11 @@ bool getMUBUFIsBufferInv(unsigned Opc) {
   return Info ? Info->IsBufferInv : false;
 }
 
+bool getMUBUFHasTFE(unsigned Opc) {
+  const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+  return Info ? Info->tfe : false;
+}
+
 bool getSMEMIsBuffer(unsigned Opc) {
   const SMInfo *Info = getSMEMOpcodeHelper(Opc);
   return Info ? Info->IsBuffer : false;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 29ac402d953513..a8a4736b8538c1 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -525,6 +525,9 @@ bool getMUBUFHasSoffset(unsigned Opc);
 LLVM_READONLY
 bool getMUBUFIsBufferInv(unsigned Opc);
 
+LLVM_READONLY
+bool getMUBUFHasTFE(unsigned Opc);
+
 LLVM_READONLY
 bool getSMEMIsBuffer(unsigned Opc);
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
index 00be32b06de058..286545d479d2ec 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
@@ -2,6 +2,7 @@
 ;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefixes=GFX6 %s
 ;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=GFX8PLUS %s
 ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=GFX11 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-enable-prt-strict-null -verify-machineinstrs | FileCheck --check-prefixes=NOPRT %s
 ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck --check-prefixes=GFX12,GFX12-SDAG %s
 ;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck --check-prefixes=GFX12,GFX12-GISEL %s
 
@@ -34,6 +35,16 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32>
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v8, 0
+; NOPRT-NEXT:    s_clause 0x2
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v8, s[0:3], 0 idxen
+; NOPRT-NEXT:    buffer_load_format_xyzw v[4:7], v8, s[0:3], 0 idxen glc
+; NOPRT-NEXT:    buffer_load_format_xyzw v[8:11], v8, s[0:3], 0 idxen slc
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v8, 0
@@ -75,6 +86,13 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_immoffs:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v0, 0
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:42
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_immoffs:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
@@ -146,6 +164,25 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
 ; GFX11-NEXT:    v_add_f32_e32 v2, v10, v2
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_immoffs_large:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v8, 0
+; NOPRT-NEXT:    s_movk_i32 s4, 0x7ffc
+; NOPRT-NEXT:    s_clause 0x1
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v8, s[0:3], 60 idxen offset:4092
+; NOPRT-NEXT:    buffer_load_format_xyzw v[4:7], v8, s[0:3], s4 idxen offset:4092
+; NOPRT-NEXT:    s_mov_b32 s4, 0x8ffc
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    v_add_f32_e32 v1, v1, v5
+; NOPRT-NEXT:    buffer_load_format_xyzw v[8:11], v8, s[0:3], s4 idxen offset:4
+; NOPRT-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v3, v3, v7
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v1, v9, v1
+; NOPRT-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; NOPRT-NEXT:    v_dual_add_f32 v0, v8, v0 :: v_dual_add_f32 v3, v11, v3
+; NOPRT-NEXT:    v_add_f32_e32 v2, v10, v2
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_immoffs_large:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v8, 0
@@ -196,6 +233,13 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(<4 x i32> inreg) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_voffset_large_12bit:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v0, 0
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:4092
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_voffset_large_12bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
@@ -235,6 +279,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_13bit(<4 x i32> inreg) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_voffset_large_13bit:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    s_mov_b32 s4, 0
+; NOPRT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT:    v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_voffset_large_13bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
@@ -274,6 +327,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_16bit(<4 x i32> inreg) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_voffset_large_16bit:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    s_mov_b32 s4, 0
+; NOPRT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT:    v_dual_mov_b32 v1, 0xf000 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_voffset_large_16bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
@@ -313,6 +375,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_23bit(<4 x i32> inreg) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_voffset_large_23bit:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    s_mov_b32 s4, 0
+; NOPRT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT:    v_dual_mov_b32 v1, 0x7ff000 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_voffset_large_23bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
@@ -352,6 +423,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_24bit(<4 x i32> inreg) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_voffset_large_24bit:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    s_mov_b32 s4, 0
+; NOPRT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT:    v_dual_mov_b32 v1, 0xfff000 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-SDAG-LABEL: buffer_load_voffset_large_24bit:
 ; GFX12-SDAG:       ; %bb.0: ; %main_body
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 0x800000 :: v_dual_mov_b32 v0, 0
@@ -389,6 +469,12 @@ define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_idx:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_idx:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_format_xyzw v[0:3], v0, s[0:3], null idxen
@@ -427,6 +513,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_ofs:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    s_mov_b32 s4, 0
+; NOPRT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_ofs:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0
@@ -466,6 +561,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_ofs_imm:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    s_mov_b32 s4, 0
+; NOPRT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_ofs_imm:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0
@@ -497,6 +601,12 @@ define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_both:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_both:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null idxen offen
@@ -529,6 +639,13 @@ define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i3
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_both_reversed:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v2, v0
+; NOPRT-NEXT:    buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_both_reversed:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v2, v0
@@ -562,6 +679,13 @@ define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_x:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v0, 0
+; NOPRT-NEXT:    buffer_load_format_x v0, v0, s[0:3], 0 idxen
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_x:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
@@ -595,6 +719,13 @@ define amdgpu_ps float @buffer_load_x_i32(<4 x i32> inreg %rsrc) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
+; NOPRT-LABEL: buffer_load_x_i32:
+; NOPRT:       ; %bb.0: ; %main_body
+; NOPRT-NEXT:    v_mov_b32_e32 v0, 0
+; NOPRT-NEXT:    buffer_load_format_x v0, v0, s[0:3], 0 idxen
+; NOPRT-NEXT:    s_waitcnt vmcnt(0)
+; NOPRT-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: buffer_load_x_i32:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
@@ -629,6 +760,13 @@ define amdgpu_ps <2 x float> @buffer_lo...
[truncated]


github-actions bot commented Mar 19, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

@dstutt requested a review from jayfoad on March 19, 2024 at 09:50
@dstutt (Collaborator, Author) commented Mar 19, 2024

There's no global isel support implemented yet - but I thought that could maybe be a separate change? (I'm not sure about the best approach currently).

@arsenm (Contributor) commented Mar 19, 2024

> There's no global isel support implemented yet - but I thought that could maybe be a separate change? (I'm not sure about the best approach currently).

I think it's best to do them together, otherwise they bifurcate

@dstutt (Collaborator, Author) commented Mar 19, 2024

> There's no global isel support implemented yet - but I thought that could maybe be a separate change? (I'm not sure about the best approach currently).
>
> I think it's best to do them together, otherwise they bifurcate

Yes, I thought you might say that :)
I'll take another look.
Any suggestions? Image support is done in AMDGPUInstructionSelector.cpp:selectImageIntrinsic - so not appropriate for the buffer instructions - but it seems like overkill to implement a selectBufferLoadTfe when selectImpl does a good job already (but maybe that's the best approach??)

@arsenm (Contributor) commented Mar 19, 2024

> Yes, I thought you might say that :) I'll take another look. Any suggestions? Image support is done in AMDGPUInstructionSelector.cpp:selectImageIntrinsic - so not appropriate for the buffer instructions - but it seems like overkill to implement a selectBufferLoadTfe when selectImpl does a good job already (but maybe that's the best approach??)

The handling of all of these intrinsics is split between legalizeImageIntrinsic / legalizeBuffer* and the select* functions. Anything changing the register types used should be done in the legalizer

return false;

// Retrieve the newly created instruction
MachineInstr &NewMI = *(std::prev(MII));
A reviewer (Contributor) commented on the snippet above: Extra parens

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (outdated review thread, resolved)
This means both DAG ISel and GlobalISel use the same code to zero the results for TFE instructions.
Additional test updates follow from the discovery that the tfe/lwe results were not being zeroed for image_msaa with the previous implementation.
@dstutt requested a review from arsenm on March 22, 2024 at 08:32
llvm/lib/Target/AMDGPU/SIISelLowering.cpp (outdated review thread, resolved)
@jayfoad (Contributor) left a review: Looks good overall.

llvm/lib/Target/AMDGPU/BUFInstructions.td (outdated review thread, resolved)
llvm/lib/Target/AMDGPU/BUFInstructions.td (outdated review thread, resolved)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x93,0x01,0x87,0xbf]
; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 ; encoding: [0x0a,0x01,0x10,0xca,0x0b,0x01,0x02,0x02]
; GFX11-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e]
; GFX11-NEXT: image_msaa_load v[0:4], v[5:7], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x98,0x02,0x60,0xf0,0x05,0x00,0x60,0x00]
A reviewer (Contributor) commented on the snippet above: Not your fault, but this code is terrible. It's zeroing v[8:12] and then copying that into v[0:4].

dstutt (Collaborator, Author) replied: Yes, I agree - it is a bit horrible.

@dstutt requested a review from jayfoad on March 22, 2024 at 13:36
@dstutt merged commit 75e528f into llvm:main on Mar 25, 2024 (4 checks passed).
@dstutt deleted the llvm-buffer-tfe-init branch on March 25, 2024 at 09:01.