diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 5fd0c1e1064cb..3fe063d540882 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -195,7 +195,8 @@ class SIWholeQuadMode { std::vector &Worklist); void markInstructionUses(const MachineInstr &MI, char Flag, std::vector &Worklist); - char scanInstructions(MachineFunction &MF, std::vector &Worklist); + char scanInstructions(MachineFunction &MF, std::vector &Worklist, + SmallVector &ExeczSideEffectInstrs); void propagateInstruction(MachineInstr &MI, std::vector &Worklist); void propagateBlock(MachineBasicBlock &MBB, std::vector &Worklist); char analyzeFunction(MachineFunction &MF); @@ -482,8 +483,9 @@ void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag, // Scan instructions to determine which ones require an Exact execmask and // which ones seed WQM requirements. -char SIWholeQuadMode::scanInstructions(MachineFunction &MF, - std::vector &Worklist) { +char SIWholeQuadMode::scanInstructions( + MachineFunction &MF, std::vector &Worklist, + SmallVector &ExeczSideEffectInstrs) { char GlobalFlags = 0; bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs"); SmallVector SoftWQMInstrs; @@ -607,6 +609,18 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, } } + if (TII->hasUnwantedEffectsWhenEXECEmpty(MI)) { + for (auto &Op : MI.uses()) { + if (!Op.isReg()) + continue; + if (!TRI->isVectorRegister(*MRI, Op.getReg())) + continue; + + ExeczSideEffectInstrs.push_back(&MI); + break; + } + } + if (Flags) { markInstruction(MI, Flags, Worklist); GlobalFlags |= Flags; @@ -715,7 +729,8 @@ void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB, char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) { std::vector Worklist; - char GlobalFlags = scanInstructions(MF, Worklist); + SmallVector ExeczSideEffectInstrs; + char GlobalFlags = scanInstructions(MF, Worklist, 
ExeczSideEffectInstrs); while (!Worklist.empty()) { WorkItem WI = Worklist.back(); @@ -725,6 +740,22 @@ char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) { propagateInstruction(*WI.MI, Worklist); else propagateBlock(*WI.MBB, Worklist); + + if (Worklist.empty()) { + // Currently we let instructions with side effects when EXEC is empty run + // under WQM; this avoids unwanted side effects in exact mode if only + // helper lanes execute the parent block. At the same time, the WQM + // property should be back-propagated along the data-flow of their sources + // to ensure their sources have correct data for helper lanes. + for (auto *MI : ExeczSideEffectInstrs) { + InstrInfo II = Instructions[MI]; + if (II.OutNeeds & StateWQM) + markInstructionUses(*MI, StateWQM, Worklist); + } + // The side-effect backward propagation should not expand the WQM region, + // so we only need to run the propagation once. + ExeczSideEffectInstrs.clear(); + } } return GlobalFlags; diff --git a/llvm/test/CodeGen/AMDGPU/wqm-propagate-for-execz-side-effect.mir b/llvm/test/CodeGen/AMDGPU/wqm-propagate-for-execz-side-effect.mir index 650d76bc4f98a..523bcd1c02546 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm-propagate-for-execz-side-effect.mir +++ b/llvm/test/CodeGen/AMDGPU/wqm-propagate-for-execz-side-effect.mir @@ -57,8 +57,8 @@ body: | ; CHECK-NEXT: [[COPY5:%[0-9]+]].sub7:sgpr_256 = COPY [[S_MOV_B32_]].sub0 ; CHECK-NEXT: [[IMAGE_SAMPLE_V1_V3_gfx12_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V3_gfx12 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], [[COPY3]], [[COPY5]], [[S_MOV_B32_]], 1, 3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) ; CHECK-NEXT: IMAGE_ATOMIC_ADD_NORTN_V1_V2_gfx12 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], [[COPY5]], 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) - ; CHECK-NEXT: [[V_MED3_F32_e64_:%[0-9]+]]:vgpr_32 = V_MED3_F32_e64 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; 
CHECK-NEXT: $exec_lo = COPY [[S_AND_SAVEEXEC_B32_]] + ; CHECK-NEXT: [[V_MED3_F32_e64_:%[0-9]+]]:vgpr_32 = V_MED3_F32_e64 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MED3_F32_e64_]], implicit $exec ; CHECK-NEXT: [[S_MUL_F32_:%[0-9]+]]:sgpr_32 = nofpexcept S_MUL_F32 [[V_READFIRSTLANE_B32_]], 0, implicit $mode