diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 3c53323dc93dd3..453be72ad88155 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3860,18 +3860,19 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI,
 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
                                            unsigned Op32) const {
-  MachineBasicBlock *MBB = MI.getParent();;
+  MachineBasicBlock *MBB = MI.getParent();
   MachineInstrBuilder Inst32 =
       BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
      .setMIFlags(MI.getFlags());
 
   // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
   // For VOPC instructions, this is replaced by an implicit def of vcc.
-  int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
-  if (Op32DstIdx != -1) {
+  if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst) != -1) {
     // dst
     Inst32.add(MI.getOperand(0));
-  } else {
+  } else if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::sdst) != -1) {
+    // VOPCX instructions won't be writing to an explicit dst, so this should
+    // not fail for these instructions.
     assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
             (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
            "Unexpected case");
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 25d3f4a765e6b1..4b7589002a7182 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1263,6 +1263,10 @@ namespace AMDGPU {
   LLVM_READONLY
   int getMFMAEarlyClobberOp(uint16_t Opcode);
 
+  /// \returns v_cmpx version of a v_cmp instruction.
+  LLVM_READONLY
+  int getVCMPXOpFromVCMP(uint16_t Opcode);
+
   const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
   const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
   const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index a81c49e82ebab1..787814172135fb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2681,6 +2681,15 @@ def getMFMAEarlyClobberOp : InstrMapping {
   let ValueCols = [["0"]];
 }
 
+// Maps a v_cmp instruction to its v_cmpx equivalent.
+def getVCMPXOpFromVCMP : InstrMapping {
+  let FilterClass = "VCMPVCMPXTable";
+  let RowFields = ["VCMPOp"];
+  let ColFields = ["IsVCMPX"];
+  let KeyCol = ["0"];
+  let ValueCols = [["1"]];
+}
+
 include "SIInstructions.td"
 
 include "DSInstructions.td"
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 9a4cc25f000853..e45642b934d281 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -9,6 +9,7 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/InitializePasses.h"
 
@@ -292,6 +293,182 @@ static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
   return false;
 }
 
+// Backwards-iterate from Origin (for n=MaxInstructions iterations) until
+// either the beginning of the BB is reached or Pred evaluates to true - which
+// can be an arbitrary condition based on the current MachineInstr, for
+// instance a target instruction. Bails out early, returning nullptr, if one of
+// the registers given in NonModifiableRegs is modified by the current
+// instruction.
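+// For example, the v_cmp / s_and_saveexec matching below uses this to find the
+// defining V_CMP of the saveexec's source operand while making sure neither
+// EXEC nor that source register is clobbered in between.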
+static MachineInstr *
+findInstrBackwards(MachineInstr &Origin,
+                   std::function<bool(MachineInstr *)> Pred,
+                   ArrayRef<MCRegister> NonModifiableRegs,
+                   const SIRegisterInfo *TRI, unsigned MaxInstructions = 5) {
+  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
+                                      E = Origin.getParent()->rend();
+  unsigned CurrentIteration = 0;
+
+  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
+    if (Pred(&*A))
+      return &*A;
+
+    for (MCRegister Reg : NonModifiableRegs) {
+      if (A->modifiesRegister(Reg, TRI))
+        return nullptr;
+    }
+
+    ++CurrentIteration;
+  }
+
+  return nullptr;
+}
+
+// Determine if a register Reg is not re-defined and still in use
+// in the range (Stop..BB.end].
+// It does so by calculating liveness backwards from the end of the BB until
+// either Stop or the beginning of the BB is reached.
+// After liveness is calculated, we can determine if Reg is still in use and
+// not defined in between the instructions.
+static bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg,
+                                 const SIRegisterInfo *TRI,
+                                 MachineRegisterInfo &MRI) {
+  LivePhysRegs LR(*TRI);
+  LR.addLiveOuts(*Stop.getParent());
+
+  for (auto A = Stop.getParent()->rbegin();
+       A != Stop.getParent()->rend() && A != Stop; ++A) {
+    LR.stepBackward(*A);
+  }
+
+  return !LR.available(MRI, Reg);
+}
+
+// Tries to find a v_cmp ..., s_and_saveexec sequence that can be optimized,
+// starting from an s_and_saveexec instruction. Returns a pointer to the v_cmp
+// instruction if it is safe to replace the sequence (see the conditions in the
+// function body). This runs after register allocation, so some checks on
+// operand dependencies need to be considered.
+static MachineInstr *findPossibleVCMPVCMPXOptimization(
+    MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI,
+    const SIInstrInfo *TII, MachineRegisterInfo &MRI) {
+
+  MachineInstr *VCmp = nullptr;
+
+  Register SaveExecDest = SaveExec.getOperand(0).getReg();
+  if (!TRI->isSGPRReg(MRI, SaveExecDest))
+    return nullptr;
+
+  MachineOperand *SaveExecSrc0 =
+      TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
+  if (!SaveExecSrc0->isReg())
+    return nullptr;
+
+  // Try to find the last v_cmp instruction that defs the saveexec input
+  // operand without any write to Exec or the saveexec input operand in
+  // between.
+  VCmp = findInstrBackwards(
+      SaveExec,
+      [&](MachineInstr *Check) {
+        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
+               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
+      },
+      {Exec, SaveExecSrc0->getReg()}, TRI);
+
+  if (!VCmp)
+    return nullptr;
+
+  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
+  assert(VCmpDest && "Should have an sdst operand!");
+
+  // Check if any of the v_cmp source operands is written by the saveexec.
+  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
+  if (Src0->isReg() && TRI->isSGPRReg(MRI, Src0->getReg()) &&
+      SaveExec.modifiesRegister(Src0->getReg(), TRI))
+    return nullptr;
+
+  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
+  if (Src1->isReg() && TRI->isSGPRReg(MRI, Src1->getReg()) &&
+      SaveExec.modifiesRegister(Src1->getReg(), TRI))
+    return nullptr;
+
+  // Don't do the transformation if the destination operand is included in
+  // its MBB live-outs, meaning it's used in any of its successors, leading
+  // to incorrect code if the v_cmp and therefore the def of the dest operand
+  // is removed.
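+  // (The replacement v_cmpx only writes EXEC, so the sdst def would disappear
+  // and a successor reading it would see a stale value.)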
+  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
+    return nullptr;
+
+  // If the v_cmp target is in use after the s_and_saveexec, skip the
+  // optimization.
+  if (isRegisterInUseAfter(SaveExec, VCmpDest->getReg(), TRI, MRI))
+    return nullptr;
+
+  // Try to determine if there is a write to any of the VCmp
+  // operands between the saveexec and the vcmp.
+  // If yes, additional VGPR spilling might need to be inserted. In this case,
+  // it's not worth replacing the instruction sequence.
+  SmallVector<MCRegister, 2> NonDefRegs;
+  if (Src0->isReg())
+    NonDefRegs.push_back(Src0->getReg());
+
+  if (Src1->isReg())
+    NonDefRegs.push_back(Src1->getReg());
+
+  if (!findInstrBackwards(
+          SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
+          NonDefRegs, TRI))
+    return nullptr;
+
+  return VCmp;
+}
+
+// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
+// operands extracted from a v_cmp ..., s_and_saveexec pattern.
+static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
+                                         MachineInstr &VCmp, MCRegister Exec,
+                                         const SIInstrInfo *TII,
+                                         const SIRegisterInfo *TRI,
+                                         MachineRegisterInfo &MRI) {
+  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
+
+  if (NewOpcode == -1)
+    return false;
+
+  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
+  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);
+
+  Register MoveDest = SaveExecInstr.getOperand(0).getReg();
+
+  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
+  if (!SaveExecInstr.uses().empty()) {
+    bool isSGPR32 = TRI->getRegSizeInBits(MoveDest, MRI) == 32;
+    unsigned MovOpcode = isSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
+            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
+        .addReg(Exec);
+  }
+
+  // Omit dst as V_CMPX is implicitly writing to EXEC.
+  // Add dummy src and clamp modifiers, if needed.
+  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
+                         VCmp.getDebugLoc(), TII->get(NewOpcode));
+
+  if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) !=
+      -1)
+    Builder.addImm(0);
+
+  Builder.add(*Src0);
+
+  if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1_modifiers) !=
+      -1)
+    Builder.addImm(0);
+
+  Builder.add(*Src1);
+
+  if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) != -1)
+    Builder.addImm(0);
+
+  return true;
+}
+
 bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
 
@@ -299,6 +476,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
   MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
   // Optimize sequences emitted for control flow lowering. They are originally
@@ -462,5 +640,45 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
     Changed = true;
   }
 
+  // After all s_op_saveexec instructions are inserted,
+  // replace (on GFX10.3 and later)
+  // v_cmp_* SGPR, IMM, VGPR
+  // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
+  // with
+  // s_mov_b32 EXEC_SGPR_DEST, exec_lo
+  // v_cmpx_* IMM, VGPR
+  // to reduce pipeline stalls.
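+  // For example (wave32, registers are illustrative):
+  //   v_cmp_lt_i32_e32 vcc_lo, 15, v0
+  //   s_and_saveexec_b32 s0, vcc_lo
+  // becomes
+  //   s_mov_b32 s0, exec_lo
+  //   v_cmpx_lt_i32_e32 15, v0
+  // (see llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll).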
+  if (ST.hasGFX10_3Insts()) {
+    DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
+    const unsigned AndSaveExecOpcode =
+        ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+
+    for (MachineBasicBlock &MBB : MF) {
+      for (MachineInstr &MI : MBB) {
+        // Record relevant v_cmp / s_and_saveexec instruction pairs for
+        // replacement.
+        if (MI.getOpcode() != AndSaveExecOpcode)
+          continue;
+
+        if (MachineInstr *VCmp =
+                findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI))
+          SaveExecVCmpMapping[&MI] = VCmp;
+      }
+    }
+
+    for (const auto &Entry : SaveExecVCmpMapping) {
+      MachineInstr *SaveExecInstr = Entry.getFirst();
+      MachineInstr *VCmpInstr = Entry.getSecond();
+
+      if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII,
+                                       TRI, *MRI)) {
+        SaveExecInstr->eraseFromParent();
+        VCmpInstr->eraseFromParent();
+
+        Changed = true;
+      }
+    }
+  }
+
   return Changed;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index e437552c2afdc3..880df8bdb67ec3 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -728,21 +728,27 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
       int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
 
       if (TII->isVOPC(Op32)) {
-        Register DstReg = MI.getOperand(0).getReg();
-        if (DstReg.isVirtual()) {
-          // VOPC instructions can only write to the VCC register. We can't
-          // force them to use VCC here, because this is only one register and
-          // cannot deal with sequences which would require multiple copies of
-          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
-          //
-          // So, instead of forcing the instruction to write to VCC, we provide
-          // a hint to the register allocator to use VCC and then we will run
-          // this pass again after RA and shrink it if it outputs to VCC.
-          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg);
-          continue;
+        MachineOperand &Op0 = MI.getOperand(0);
+        if (Op0.isReg()) {
+          // Exclude VOPCX instructions as these don't explicitly write a
+          // dst.
+          Register DstReg = Op0.getReg();
+          if (DstReg.isVirtual()) {
+            // VOPC instructions can only write to the VCC register. We can't
+            // force them to use VCC here, because this is only one register and
+            // cannot deal with sequences which would require multiple copies of
+            // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
+            //
+            // So, instead of forcing the instruction to write to VCC, we
+            // provide a hint to the register allocator to use VCC and then we
+            // will run this pass again after RA and shrink it if it outputs to
+            // VCC.
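+            // (The hint is not binding; if the allocator picks a different
+            // SGPR, the instruction simply keeps its 64-bit encoding.)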
+            MRI.setRegAllocationHint(DstReg, 0, VCCReg);
+            continue;
+          }
+          if (DstReg != VCCReg)
+            continue;
         }
-        if (DstReg != VCCReg)
-          continue;
       }
 
       if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index c0cc91029d1116..1220b5c8ac35d5 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -205,6 +205,11 @@ class VCMPXNoSDstTable {
   string NoSDstOp = Name;
 }
 
+class VCMPVCMPXTable <string Name> {
+  bit IsVCMPX = 0;
+  string VCMPOp = Name;
+}
+
 multiclass VOPC_Pseudos ,
              Commutable_REV,
-             VCMPXNoSDstTable<1, opName#"_e32"> {
+             VCMPXNoSDstTable<1, opName#"_e32">,
+             VCMPVCMPXTable {
     let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
     let SchedRW = P.Schedule;
     let isConvergent = DefExec;
@@ -223,7 +229,8 @@ multiclass VOPC_Pseudos .ret>,
              Commutable_REV,
-             VCMPXNoSDstTable<1, opName#"_e64"> {
+             VCMPXNoSDstTable<1, opName#"_e64">,
+             VCMPVCMPXTable {
     let Defs = !if(DefExec, [EXEC], []);
     let SchedRW = P.Schedule;
     let isCompare = 1;
@@ -248,23 +255,27 @@ multiclass VOPCX_Pseudos ,
                    Commutable_REV,
-                   VCMPXNoSDstTable<0, opName#"_e32"> {
+                   VCMPXNoSDstTable<0, opName#"_e32">,
+                   VCMPVCMPXTable {
     let Defs = [EXEC];
     let SchedRW = P_NoSDst.Schedule;
     let isConvergent = 1;
     let isCompare = 1;
     let isCommutable = 1;
     let SubtargetPredicate = HasNoSdstCMPX;
+    let IsVCMPX = 1;
   }
 
   def _nosdst_e64 : VOP3_Pseudo,
                     Commutable_REV,
-                    VCMPXNoSDstTable<0, opName#"_e64"> {
+                    VCMPXNoSDstTable<0, opName#"_e64">,
+                    VCMPVCMPXTable {
     let Defs = [EXEC];
     let SchedRW = P_NoSDst.Schedule;
     let isCompare = 1;
     let isCommutable = 1;
     let SubtargetPredicate = HasNoSdstCMPX;
+    let IsVCMPX = 1;
   }
 
   foreach _ = BoolToList.ret in
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
index 2fed25fff3d50c..a2c35b97aef65e 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
@@ -6,7 +6,7 @@
 ; GCN-LABEL: long_forward_scc_branch_3f_offset_bug:
 ; GFX1030: s_cmp_lg_u32
-; GFX1030-NEXT: s_cbranch_scc1 [[ENDBB:.LBB[0-9]+_[0-9]+]]
+; GFX1030: s_cbranch_scc1 [[ENDBB:.LBB[0-9]+_[0-9]+]]
 
 ; GFX1010: s_cmp_lg_u32
 ; GFX1010-NEXT: s_cbranch_scc0 [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
@@ -51,9 +51,9 @@ bb3:
 }
 
 ; GCN-LABEL: {{^}}long_forward_exec_branch_3f_offset_bug:
-; GFX1030: v_cmp_eq_u32
-; GFX1030: s_and_saveexec_b32
-; GFX1030-NEXT: s_cbranch_execnz [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
+; GFX1030: s_mov_b32
+; GFX1030: v_cmpx_eq_u32
+; GFX1030: s_cbranch_execnz [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
 
 ; GFX1010: v_cmp_eq_u32
 ; GFX1010: s_and_saveexec_b32
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
new file mode 100644
index 00000000000000..f8aae95f62b12b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
@@ -0,0 +1,167 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_lt:
+; GFX1010: v_cmp_lt_i32_e32 vcc_lo, 15, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_lt_i32_e32 15, v{{.*}}
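+; On gfx1030 the v_cmp + s_and_saveexec pair is expected to be rewritten into
+; s_mov_b32 + v_cmpx; on gfx1010 the original sequence must remain.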
+define i32 @test_insert_vcmpx_pattern_lt(i32 %x) {
+entry:
+  %bc = icmp slt i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_gt:
+; GFX1010: v_cmp_gt_i32_e32 vcc_lo, 17, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_gt_i32_e32 17, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_gt(i32 %x) {
+entry:
+  %bc = icmp sgt i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_eq:
+; GFX1010: v_cmp_ne_u32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_ne_u32_e32 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_eq(i32 %x) {
+entry:
+  %bc = icmp eq i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_ne:
+; GFX1010: v_cmp_eq_u32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_eq_u32_e32 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_ne(i32 %x) {
+entry:
+  %bc = icmp ne i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_le:
+; GFX1010: v_cmp_lt_i32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_lt_i32_e32 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_le(i32 %x) {
+entry:
+  %bc = icmp sle i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_ge:
+; GFX1010: v_cmp_gt_i32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_gt_i32_e32 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_ge(i32 %x) {
+entry:
+  %bc = icmp sge i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+declare amdgpu_gfx void @check_live_outs_helper(i64) #0
+
+; In cases where the output operand cannot be safely removed,
+; don't apply the v_cmpx transformation.
+
+; GCN-LABEL: {{^}}check_live_outs:
+; GFX1010: v_cmp_eq_u32_e64 s{{.*}}, v{{.*}}, v{{.*}}
+; GFX1010: s_and_saveexec_b32 s{{.*}}, s{{.*}}
+; GFX1030: v_cmp_eq_u32_e64 s{{.*}}, v{{.*}}, v{{.*}}
+; GFX1030: s_and_saveexec_b32 s{{.*}}, s{{.*}}
+define amdgpu_cs void @check_live_outs(i32 %a, i32 %b) {
+  %cond = icmp eq i32 %a, %b
+  %result = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
+  br i1 %cond, label %l1, label %l2
+l1:
+  call amdgpu_gfx void @check_live_outs_helper(i64 %result)
+  br label %l2
+l2:
+  ret void
+}
+
+; Omit the transformation if the s_and_saveexec instruction overwrites
+; any of the v_cmp source operands.
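+; Here the s_and_saveexec destination SGPR is also the first source of the
+; v_cmp, so the sequence has to stay as it is on both targets.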
+
+; GCN-LABEL: check_saveexec_overwrites_vcmp_source:
+; GCN: ; %bb.1: ; %then
+; GFX1010: v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}}
+; GFX1010-NEXT: v_mov_b32_e32 {{.*}}, s[[A]]
+; GFX1010-NEXT: s_and_saveexec_b32 s[[A]], vcc_lo
+; GFX1030: v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}}
+; GFX1030-NEXT: v_mov_b32_e32 {{.*}}, s[[A]]
+; GFX1030-NEXT: s_and_saveexec_b32 s[[A]], vcc_lo
+define i32 @check_saveexec_overwrites_vcmp_source(i32 inreg %a, i32 inreg %b) {
+entry:
+  %0 = icmp sge i32 %a, 0
+  br i1 %0, label %if, label %then
+
+if:
+  %1 = shl i32 %a, 2
+  %2 = or i32 %1, %b
+  ret i32 %2
+
+then:
+  %3 = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
+  %4 = trunc i64 %3 to i32
+  %5 = icmp slt i32 %4, %b
+  br i1 %5, label %after, label %end
+
+after:
+  ret i32 %4
+
+end:
+  ret i32 %a
+}
+
+declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
new file mode 100644
index 00000000000000..52cd6f5d476046
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
@@ -0,0 +1,24 @@
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+
+# After the optimize exec masking (post-RA) pass, there is a chance of v_cmpx
+# instructions being introduced whenever a sequence of v_cmp and s_and_saveexec
+# instructions can be safely replaced in various cases.
+# However, it is not safe to do so when the generated code sequence would omit
+# part of the EXEC mask, which can occur when a subset of EXEC is used as an
+# input operand of the v_cmp instruction. This test checks that subregisters are handled correctly here.
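+# Here the s_and_saveexec destination ($sgpr2_sgpr3) overlaps the $sgpr2 source
+# of the v_cmp, so the sequence below must not be turned into s_mov_b64 + v_cmpx.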
+
+# GCN-LABEL: name: vcmp_saveexec_to_mov_vcmpx_exec_subreg
+# GCN: V_CMP_GT_U32_e64
+# GCN: S_AND_SAVEEXEC_B64
+name: vcmp_saveexec_to_mov_vcmpx_exec_subreg
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $sgpr2
+    renamable $sgpr0_sgpr1 = V_CMP_GT_U32_e64 $sgpr2, killed $vgpr0, implicit $exec
+    $sgpr2_sgpr3 = COPY $exec, implicit-def $exec
+    $sgpr2_sgpr3 = S_AND_B64 killed renamable $sgpr2_sgpr3, killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+    $exec = S_MOV_B64_term killed renamable $sgpr2_sgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index e462a460c93fa6..7cfb43ba802ac1 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1250,8 +1250,8 @@ define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10-W32:       ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT:    v_cmpx_ne_u32_e32 0, v1
 ; GFX10-W32-NEXT:    s_xor_b32 s13, exec_lo, s13
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB23_2
 ; GFX10-W32-NEXT:  ; %bb.1: ; %ELSE
@@ -1329,8 +1329,8 @@ define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10-W32:       ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT:    v_cmpx_ne_u32_e32 0, v1
 ; GFX10-W32-NEXT:    s_xor_b32 s13, exec_lo, s13
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB24_2
 ; GFX10-W32-NEXT:  ; %bb.1: ; %IF
@@ -1508,10 +1508,10 @@ define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0, v1
 ; GFX10-W32-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
+; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX10-W32-NEXT:    ; implicit-def: $vgpr0
-; GFX10-W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX10-W32-NEXT:    v_cmpx_nlt_f32_e32 0, v1
 ; GFX10-W32-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX10-W32-NEXT:  ; %bb.1: ; %ELSE
 ; GFX10-W32-NEXT:    v_mul_f32_e32 v0, 4.0, v1
@@ -1577,8 +1577,8 @@ define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i3
 ; GFX10-W32:       ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB27_2
 ; GFX10-W32-NEXT:  ; %bb.1: ; %IF
 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s14, s12
@@ -2960,9 +2960,9 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s13
-; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB46_2
 ; GFX10-W32-NEXT:  ; %bb.1: ; %IF
 ; GFX10-W32-NEXT:    s_mov_b32 s14, exec_lo