diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 66bc46aaefea1..19a83ad53e2ed 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -12,6 +12,8 @@
 #include "SIRegisterInfo.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/InitializePasses.h"

 using namespace llvm;
@@ -26,6 +28,10 @@ class SIOptimizeExecMasking : public MachineFunctionPass {
   const SIRegisterInfo *TRI = nullptr;
   const SIInstrInfo *TII = nullptr;
   const MachineRegisterInfo *MRI = nullptr;
+  MCRegister Exec;
+
+  DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
+  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors;

   Register isCopyFromExec(const MachineInstr &MI) const;
   Register isCopyToExec(const MachineInstr &MI) const;
@@ -44,13 +50,13 @@ class SIOptimizeExecMasking : public MachineFunctionPass {
                                    std::function<bool(MachineInstr *)> Pred,
                                    ArrayRef<MCRegister> NonModifiableRegs,
                                    unsigned MaxInstructions = 20) const;
-  MachineInstr *findPossibleVCMPVCMPXOptimization(MachineInstr &SaveExec,
-                                                  MCRegister Exec) const;
-  bool optimizeExecSequence() const;
-  bool optimizeVCmpxAndSaveexecSequence() const;
-  bool optimizeSingleVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
-                                          MachineInstr &VCmp,
-                                          MCRegister Exec) const;
+  bool optimizeExecSequence();
+  void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI);
+  bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
+                                    MachineInstr &VCmp, MCRegister Exec) const;
+
+  void tryRecordOrSaveexecXorSequence(MachineInstr &MI);
+  bool optimizeOrSaveexecXorSequences();

 public:
   static char ID;
@@ -92,7 +98,7 @@ Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
   case AMDGPU::S_MOV_B32:
   case AMDGPU::S_MOV_B32_term: {
     const MachineOperand &Src = MI.getOperand(1);
-    if (Src.isReg() && Src.getReg() == TRI->getExec())
+    if (Src.isReg() && Src.getReg() == Exec)
       return MI.getOperand(0).getReg();
   }
   }
@@ -107,8 +113,7 @@ Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
   case AMDGPU::S_MOV_B64:
   case AMDGPU::S_MOV_B32: {
     const MachineOperand &Dst = MI.getOperand(0);
-    if (Dst.isReg() && Dst.getReg() == TRI->getExec() &&
-        MI.getOperand(1).isReg())
+    if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg())
       return MI.getOperand(1).getReg();
     break;
   }
@@ -394,9 +399,7 @@ bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
 // =>
 //    x = s_<op>_saveexec_b64 y
 //
-bool SIOptimizeExecMasking::optimizeExecSequence() const {
-  MCRegister Exec = TRI->getExec();
-
+bool SIOptimizeExecMasking::optimizeExecSequence() {
   bool Changed = false;
   for (MachineBasicBlock &MBB : *MF) {
     MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
@@ -551,88 +554,9 @@ bool SIOptimizeExecMasking::optimizeExecSequence() const {
   return Changed;
 }

-// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence
-// by looking at an instance of a s_and_saveexec instruction. Returns a pointer
-// to the v_cmp instruction if it is safe to replace the sequence (see the
-// conditions in the function body). This is after register allocation, so some
-// checks on operand dependencies need to be considered.
-MachineInstr *SIOptimizeExecMasking::findPossibleVCMPVCMPXOptimization(
-    MachineInstr &SaveExec, MCRegister Exec) const {
-
-  MachineInstr *VCmp = nullptr;
-
-  Register SaveExecDest = SaveExec.getOperand(0).getReg();
-  if (!TRI->isSGPRReg(*MRI, SaveExecDest))
-    return nullptr;
-
-  MachineOperand *SaveExecSrc0 =
-      TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
-  if (!SaveExecSrc0->isReg())
-    return nullptr;
-
-  // Try to find the last v_cmp instruction that defs the saveexec input
-  // operand without any write to Exec or the saveexec input operand inbetween.
-  VCmp = findInstrBackwards(
-      SaveExec,
-      [&](MachineInstr *Check) {
-        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
-               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
-      },
-      {Exec, SaveExecSrc0->getReg()});
-
-  if (!VCmp)
-    return nullptr;
-
-  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
-  assert(VCmpDest && "Should have an sdst operand!");
-
-  // Check if any of the v_cmp source operands is written by the saveexec.
-  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
-  if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
-      SaveExec.modifiesRegister(Src0->getReg(), TRI))
-    return nullptr;
-
-  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
-  if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
-      SaveExec.modifiesRegister(Src1->getReg(), TRI))
-    return nullptr;
-
-  // Don't do the transformation if the destination operand is included in
-  // it's MBB Live-outs, meaning it's used in any of it's successors, leading
-  // to incorrect code if the v_cmp and therefore the def of
-  // the dest operand is removed.
-  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
-    return nullptr;
-
-  // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
-  // s_and_saveexec, skip the optimization.
-  if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), false,
-                             true) ||
-      isRegisterInUseAfter(SaveExec, VCmpDest->getReg()))
-    return nullptr;
-
-  // Try to determine if there is a write to any of the VCmp
-  // operands between the saveexec and the vcmp.
-  // If yes, additional VGPR spilling might need to be inserted. In this case,
-  // it's not worth replacing the instruction sequence.
-  SmallVector<MCRegister, 2> NonDefRegs;
-  if (Src0->isReg())
-    NonDefRegs.push_back(Src0->getReg());
-
-  if (Src1->isReg())
-    NonDefRegs.push_back(Src1->getReg());
-
-  if (!findInstrBackwards(
-          SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
-          NonDefRegs))
-    return nullptr;
-
-  return VCmp;
-}
-
 // Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
 // operands extracted from a v_cmp ..., s_and_saveexec pattern.
-bool SIOptimizeExecMasking::optimizeSingleVCMPSaveExecSequence(
+bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
     MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const {

   const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
@@ -678,50 +602,164 @@ bool SIOptimizeExecMasking::optimizeSingleVCMPSaveExecSequence(
   if (Src1->isReg())
     MRI->clearKillFlags(Src1->getReg());

+  SaveExecInstr.eraseFromParent();
+  VCmp.eraseFromParent();
+
   return true;
 }

-// After all s_op_saveexec instructions are inserted,
-// replace (on GFX10.3 and later)
+// Record (on GFX10.3 and later) occurrences of
 // v_cmp_* SGPR, IMM, VGPR
 // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
-// with
+// to be replaced with
 // s_mov_b32 EXEC_SGPR_DEST, exec_lo
 // v_cmpx_* IMM, VGPR
 // to reduce pipeline stalls.
-bool SIOptimizeExecMasking::optimizeVCmpxAndSaveexecSequence() const {
+void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
+    MachineInstr &MI) {
   if (!ST->hasGFX10_3Insts())
-    return false;
+    return;

-  bool Changed = false;
-
-  DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
-  MCRegister Exec = TRI->getExec();
   const unsigned AndSaveExecOpcode =
       ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;

-  for (MachineBasicBlock &MBB : *MF) {
-    for (MachineInstr &MI : MBB) {
-      // Record relevant v_cmp / s_and_saveexec instruction pairs for
-      // replacement.
-      if (MI.getOpcode() != AndSaveExecOpcode)
-        continue;
+  if (MI.getOpcode() != AndSaveExecOpcode)
+    return;
+
+  Register SaveExecDest = MI.getOperand(0).getReg();
+  if (!TRI->isSGPRReg(*MRI, SaveExecDest))
+    return;

-      if (MachineInstr *VCmp = findPossibleVCMPVCMPXOptimization(MI, Exec))
-        SaveExecVCmpMapping[&MI] = VCmp;
+  MachineOperand *SaveExecSrc0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+  if (!SaveExecSrc0->isReg())
+    return;
+
+  // Try to find a v_cmp ..., s_and_saveexec sequence that can be optimized,
+  // by looking at this instance of an s_and_saveexec instruction. The v_cmp
+  // instruction is recorded if it is safe to replace the sequence (see the
+  // conditions in the function body). This is after register allocation, so
+  // some checks on operand dependencies need to be considered.
+  MachineInstr *VCmp = nullptr;
+
+  // Try to find the last v_cmp instruction that defs the saveexec input
+  // operand without any write to Exec or the saveexec input operand in between.
+  VCmp = findInstrBackwards(
+      MI,
+      [&](MachineInstr *Check) {
+        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
+               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
+      },
+      {Exec, SaveExecSrc0->getReg()});
+
+  if (!VCmp)
+    return;
+
+  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
+  assert(VCmpDest && "Should have an sdst operand!");
+
+  // Check if any of the v_cmp source operands is written by the saveexec.
+  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
+  if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
+      MI.modifiesRegister(Src0->getReg(), TRI))
+    return;
+
+  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
+  if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
+      MI.modifiesRegister(Src1->getReg(), TRI))
+    return;
+
+  // Don't do the transformation if the destination operand is included in
+  // its MBB live-outs, meaning it is used in any of its successors, leading
+  // to incorrect code if the v_cmp and therefore the def of
+  // the dest operand is removed.
+  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
+    return;
+
+  // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
+  // s_and_saveexec, skip the optimization.
+  if (isRegisterInUseBetween(*VCmp, MI, VCmpDest->getReg(), false, true) ||
+      isRegisterInUseAfter(MI, VCmpDest->getReg()))
+    return;
+
+  // Try to determine if there is a write to any of the VCmp
+  // operands between the saveexec and the vcmp.
+  // If yes, additional VGPR spilling might need to be inserted. In this case,
+  // it's not worth replacing the instruction sequence.
+  SmallVector<MCRegister, 2> NonDefRegs;
+  if (Src0->isReg())
+    NonDefRegs.push_back(Src0->getReg());
+
+  if (Src1->isReg())
+    NonDefRegs.push_back(Src1->getReg());
+
+  if (!findInstrBackwards(
+          MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs))
+    return;
+
+  if (VCmp)
+    SaveExecVCmpMapping[&MI] = VCmp;
+}
+
+// Record occurrences of
+// s_or_saveexec s_o, s_i
+// s_xor exec, exec, s_o
+// to be replaced with
+// s_andn2_saveexec s_o, s_i.
+void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) {
+  const unsigned XorOpcode =
+      ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;
+
+  if (MI.getOpcode() == XorOpcode && &MI != &MI.getParent()->front()) {
+    const MachineOperand &XorDst = MI.getOperand(0);
+    const MachineOperand &XorSrc0 = MI.getOperand(1);
+    const MachineOperand &XorSrc1 = MI.getOperand(2);
+
+    if (XorDst.isReg() && XorDst.getReg() == Exec && XorSrc0.isReg() &&
+        XorSrc1.isReg() &&
+        (XorSrc0.getReg() == Exec || XorSrc1.getReg() == Exec)) {
+      const unsigned OrSaveexecOpcode = ST->isWave32()
+                                            ? AMDGPU::S_OR_SAVEEXEC_B32
+                                            : AMDGPU::S_OR_SAVEEXEC_B64;
+
+      // Peek at the previous instruction and check if this is a relevant
+      // s_or_saveexec instruction.
+      MachineInstr &PossibleOrSaveexec = *MI.getPrevNode();
+      if (PossibleOrSaveexec.getOpcode() != OrSaveexecOpcode)
+        return;
+
+      const MachineOperand &OrDst = PossibleOrSaveexec.getOperand(0);
+      const MachineOperand &OrSrc0 = PossibleOrSaveexec.getOperand(1);
+      if (OrDst.isReg() && OrSrc0.isReg()) {
+        if ((XorSrc0.getReg() == Exec && XorSrc1.getReg() == OrDst.getReg()) ||
+            (XorSrc0.getReg() == OrDst.getReg() && XorSrc1.getReg() == Exec)) {
+          OrXors.emplace_back(&PossibleOrSaveexec, &MI);
+        }
+      }
     }
   }
+}

-  for (const auto &Entry : SaveExecVCmpMapping) {
-    MachineInstr *SaveExecInstr = Entry.getFirst();
-    MachineInstr *VCmpInstr = Entry.getSecond();
+bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() {
+  if (OrXors.empty()) {
+    return false;
+  }

-    if (optimizeSingleVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec)) {
-      SaveExecInstr->eraseFromParent();
-      VCmpInstr->eraseFromParent();
+  bool Changed = false;
+  const unsigned Andn2Opcode = ST->isWave32() ? AMDGPU::S_ANDN2_SAVEEXEC_B32
+                                              : AMDGPU::S_ANDN2_SAVEEXEC_B64;

-      Changed = true;
-    }
+  for (const auto &Pair : OrXors) {
+    MachineInstr *Or = nullptr;
+    MachineInstr *Xor = nullptr;
+    std::tie(Or, Xor) = Pair;
+    BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(),
+            TII->get(Andn2Opcode), Or->getOperand(0).getReg())
+        .addReg(Or->getOperand(1).getReg());
+
+    Or->eraseFromParent();
+    Xor->eraseFromParent();
+
+    Changed = true;
   }

   return Changed;
 }
@@ -736,9 +774,42 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
   TRI = ST->getRegisterInfo();
   TII = ST->getInstrInfo();
   MRI = &MF.getRegInfo();
+  Exec = TRI->getExec();

   bool Changed = optimizeExecSequence();
-  Changed |= optimizeVCmpxAndSaveexecSequence();
+
+  OrXors.clear();
+  SaveExecVCmpMapping.clear();
+  static unsigned SearchWindow = 10;
+  for (MachineBasicBlock &MBB : MF) {
+    unsigned SearchCount = 0;
+
+    for (auto &MI : llvm::reverse(MBB)) {
+      if (MI.isDebugInstr())
+        continue;
+
+      if (SearchCount >= SearchWindow) {
+        break;
+      }
+
+      tryRecordOrSaveexecXorSequence(MI);
+      tryRecordVCmpxAndSaveexecSequence(MI);
+
+      if (MI.modifiesRegister(Exec, TRI)) {
+        break;
+      }
+
+      ++SearchCount;
+    }
+  }
+
+  Changed |= optimizeOrSaveexecXorSequences();
+  for (const auto &Entry : SaveExecVCmpMapping) {
+    MachineInstr *SaveExecInstr = Entry.getFirst();
+    MachineInstr *VCmpInstr = Entry.getSecond();
+
+    Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec);
+  }

   return Changed;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
index af73bffeca09b..a1e40dd5da91d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
@@ -32,8 +32,7 @@ define amdgpu_cs void @memmove_p1i8(i8 addrspace(1)* %dst, i8 addrspace(1)* %src
 ; LOOP-NEXT:    buffer_store_byte v8, v[6:7], s[0:3], 0 addr64
 ; LOOP-NEXT:    s_cbranch_vccnz .LBB0_2
 ; LOOP-NEXT:  .LBB0_3: ; %Flow14
-; LOOP-NEXT:    s_or_saveexec_b64 s[0:1], s[4:5]
-; LOOP-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; LOOP-NEXT:    s_andn2_saveexec_b64 s[0:1], s[4:5]
 ; LOOP-NEXT:    s_cbranch_execz .LBB0_6
 ; LOOP-NEXT:  ; %bb.4: ; %copy_backwards
 ; LOOP-NEXT:    s_mov_b64 s[4:5], 3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index f1abd389a2391..672b4d9445d33 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -173,8 +173,14 @@ define void @localize_internal_globals(i1 %cond) {
 ; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, -1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
-; GFX9-NEXT:    s_cbranch_execz .LBB2_2
-; GFX9-NEXT:  ; %bb.1: ; %bb1
+; GFX9-NEXT:    s_cbranch_execnz .LBB2_3
+; GFX9-NEXT:  ; %bb.1: ; %Flow
+; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT:    s_cbranch_execnz .LBB2_4
+; GFX9-NEXT:  .LBB2_2: ; %bb2
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-NEXT:  .LBB2_3: ; %bb1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, static.gv2@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, static.gv2@rel32@hi+12
@@ -187,11 +193,9 @@ define void @localize_internal_globals(i1 %cond) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:  .LBB2_2: ; %Flow
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execz
.LBB2_4 -; GFX9-NEXT: ; %bb.3: ; %bb0 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: .LBB2_4: ; %bb0 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, static.gv0@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, static.gv0@rel32@hi+12 @@ -204,9 +208,7 @@ define void @localize_internal_globals(i1 %cond) { ; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: .LBB2_4: ; %bb2 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] entry: br i1 %cond, label %bb0, label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 22d07a4ca07d4..64ab731adb1f2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -16,8 +16,14 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB0_2 -; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_cbranch_execnz .LBB0_3 +; CHECK-NEXT: ; %bb.1: ; %Flow +; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; CHECK-NEXT: s_cbranch_execnz .LBB0_4 +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB0_3: ; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v0 ; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc @@ -151,11 +157,9 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 -; CHECK-NEXT: .LBB0_2: ; %Flow -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] -; CHECK-NEXT: s_xor_b64 exec, exec, s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB0_4 -; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v2 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -176,7 +180,6 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %num, %den @@ -787,8 +790,7 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 ; CGP-NEXT: .LBB2_2: ; %Flow1 -; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] +; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; CGP-NEXT: s_cbranch_execz .LBB2_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v4 @@ -819,8 +821,14 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB2_6 -; CGP-NEXT: ; %bb.5: +; CGP-NEXT: s_cbranch_execnz .LBB2_7 +; CGP-NEXT: ; %bb.5: ; %Flow +; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; CGP-NEXT: s_cbranch_execnz .LBB2_8 +; CGP-NEXT: .LBB2_6: +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: s_setpc_b64 s[30:31] +; CGP-NEXT: .LBB2_7: ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; 
CGP-NEXT: v_add_i32_e32 v3, vcc, v6, v2 ; CGP-NEXT: v_addc_u32_e32 v4, vcc, v7, v2, vcc @@ -954,11 +962,9 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 -; CGP-NEXT: .LBB2_6: ; %Flow -; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB2_8 -; CGP-NEXT: ; %bb.7: +; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; CGP-NEXT: s_cbranch_execz .LBB2_6 +; CGP-NEXT: .LBB2_8: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -979,7 +985,6 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 -; CGP-NEXT: .LBB2_8: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i64> %num, %den @@ -2328,8 +2333,14 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB7_2 -; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_cbranch_execnz .LBB7_3 +; CHECK-NEXT: ; %bb.1: ; %Flow +; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; CHECK-NEXT: s_cbranch_execnz .LBB7_4 +; CHECK-NEXT: .LBB7_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB7_3: ; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v5, v0 ; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v6, v0, vcc @@ -2463,11 +2474,9 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 -; CHECK-NEXT: .LBB7_2: ; %Flow -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] -; CHECK-NEXT: s_xor_b64 exec, exec, s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB7_4 -; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; CHECK-NEXT: s_cbranch_execz .LBB7_2 +; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -2488,7 +2497,6 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_setpc_b64 s[30:31] %shl.y = shl i64 4096, %y @@ -2953,8 +2961,14 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB8_6 -; CGP-NEXT: ; %bb.5: +; CGP-NEXT: s_cbranch_execnz .LBB8_7 +; CGP-NEXT: ; %bb.5: ; %Flow +; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; CGP-NEXT: s_cbranch_execnz .LBB8_8 +; CGP-NEXT: .LBB8_6: +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: s_setpc_b64 s[30:31] +; CGP-NEXT: .LBB8_7: ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v10 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v2 ; CGP-NEXT: v_addc_u32_e32 v4, vcc, v10, v2, vcc @@ -3088,11 +3102,9 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc ; CGP-NEXT: ; 
implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 -; CGP-NEXT: .LBB8_6: ; %Flow -; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB8_8 -; CGP-NEXT: ; %bb.7: +; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; CGP-NEXT: s_cbranch_execz .LBB8_6 +; CGP-NEXT: .LBB8_8: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v9 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -3113,7 +3125,6 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 -; CGP-NEXT: .LBB8_8: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: s_setpc_b64 s[30:31] %shl.y = shl <2 x i64> , %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index b647e66a74cc5..91e435aad7aef 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -16,8 +16,14 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB0_2 -; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_cbranch_execnz .LBB0_3 +; CHECK-NEXT: ; %bb.1: ; %Flow +; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; CHECK-NEXT: s_cbranch_execnz .LBB0_4 +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB0_3: ; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v0 ; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc @@ -149,11 +155,9 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 -; CHECK-NEXT: .LBB0_2: ; %Flow -; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] -; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB0_4 -; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v2 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -172,7 +176,6 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, %den @@ -775,8 +778,7 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 ; CGP-NEXT: .LBB2_2: ; %Flow1 -; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] +; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] ; CGP-NEXT: s_cbranch_execz .LBB2_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v4 @@ -805,8 +807,14 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB2_6 -; CGP-NEXT: ; %bb.5: +; CGP-NEXT: s_cbranch_execnz .LBB2_7 +; CGP-NEXT: ; %bb.5: ; %Flow +; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; CGP-NEXT: s_cbranch_execnz .LBB2_8 +; CGP-NEXT: .LBB2_6: 
+; CGP-NEXT: s_or_b64 exec, exec, s[4:5] +; CGP-NEXT: s_setpc_b64 s[30:31] +; CGP-NEXT: .LBB2_7: ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v6, v2 ; CGP-NEXT: v_addc_u32_e32 v4, vcc, v7, v2, vcc @@ -938,11 +946,9 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 -; CGP-NEXT: .LBB2_6: ; %Flow -; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB2_8 -; CGP-NEXT: ; %bb.7: +; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; CGP-NEXT: s_cbranch_execz .LBB2_6 +; CGP-NEXT: .LBB2_8: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -961,7 +967,6 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 -; CGP-NEXT: .LBB2_8: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, %den @@ -2294,8 +2299,14 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB7_2 -; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_cbranch_execnz .LBB7_3 +; CHECK-NEXT: ; %bb.1: ; %Flow +; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; CHECK-NEXT: s_cbranch_execnz .LBB7_4 +; CHECK-NEXT: .LBB7_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB7_3: ; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v5, v0 ; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v6, v0, vcc @@ -2427,11 +2438,9 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 -; CHECK-NEXT: .LBB7_2: ; %Flow -; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] -; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB7_4 -; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; CHECK-NEXT: s_cbranch_execz .LBB7_2 +; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -2450,7 +2459,6 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_setpc_b64 s[30:31] %shl.y = shl i64 4096, %y @@ -2910,8 +2918,14 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB8_6 -; CGP-NEXT: ; %bb.5: +; CGP-NEXT: s_cbranch_execnz .LBB8_7 +; CGP-NEXT: ; %bb.5: ; %Flow +; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; CGP-NEXT: s_cbranch_execnz .LBB8_8 +; CGP-NEXT: .LBB8_6: +; CGP-NEXT: s_or_b64 exec, exec, s[4:5] +; CGP-NEXT: s_setpc_b64 s[30:31] +; CGP-NEXT: .LBB8_7: ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v10 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v2 ; CGP-NEXT: v_addc_u32_e32 v4, vcc, v10, v2, vcc @@ -3043,11 +3057,9 @@ 
define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc ; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 -; CGP-NEXT: .LBB8_6: ; %Flow -; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB8_8 -; CGP-NEXT: ; %bb.7: +; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; CGP-NEXT: s_cbranch_execz .LBB8_6 +; CGP-NEXT: .LBB8_8: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v9 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -3066,7 +3078,6 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v9 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 -; CGP-NEXT: .LBB8_8: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] %shl.y = shl <2 x i64> , %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index ebb7381d0ae1a..346b7d2deb18b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -16,8 +16,14 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB0_2 -; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_cbranch_execnz .LBB0_3 +; CHECK-NEXT: ; %bb.1: ; %Flow +; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; CHECK-NEXT: s_cbranch_execnz .LBB0_4 +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB0_3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v3 ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 @@ -145,11 +151,9 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 -; CHECK-NEXT: .LBB0_2: ; %Flow -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] -; CHECK-NEXT: s_xor_b64 exec, exec, s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB0_4 -; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v2 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -170,7 +174,6 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 %num, %den @@ -757,8 +760,7 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 ; CGP-NEXT: .LBB2_2: ; %Flow1 -; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] +; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; CGP-NEXT: s_cbranch_execz .LBB2_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v4 @@ -789,8 +791,14 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB2_6 -; CGP-NEXT: ; %bb.5: +; CGP-NEXT: s_cbranch_execnz .LBB2_7 +; CGP-NEXT: ; 
%bb.5: ; %Flow +; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; CGP-NEXT: s_cbranch_execnz .LBB2_8 +; CGP-NEXT: .LBB2_6: +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: s_setpc_b64 s[30:31] +; CGP-NEXT: .LBB2_7: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v7 ; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v6 @@ -918,11 +926,9 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 -; CGP-NEXT: .LBB2_6: ; %Flow -; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB2_8 -; CGP-NEXT: ; %bb.7: +; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; CGP-NEXT: s_cbranch_execz .LBB2_6 +; CGP-NEXT: .LBB2_8: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -943,7 +949,6 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 -; CGP-NEXT: .LBB2_8: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: s_setpc_b64 s[30:31] %result = udiv <2 x i64> %num, %den @@ -1073,8 +1078,14 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB7_2 -; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_cbranch_execnz .LBB7_3 +; CHECK-NEXT: ; %bb.1: ; %Flow +; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; CHECK-NEXT: s_cbranch_execnz .LBB7_4 +; CHECK-NEXT: .LBB7_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB7_3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5 ; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v6 ; CHECK-NEXT: v_sub_i32_e32 v2, vcc, 0, v5 @@ -1202,11 +1213,9 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 -; CHECK-NEXT: .LBB7_2: ; %Flow -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] -; CHECK-NEXT: s_xor_b64 exec, exec, s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB7_4 -; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; CHECK-NEXT: s_cbranch_execz .LBB7_2 +; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -1227,7 +1236,6 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_setpc_b64 s[30:31] %shl.y = shl i64 4096, %y @@ -1672,8 +1680,14 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB8_6 -; CGP-NEXT: ; %bb.5: +; CGP-NEXT: s_cbranch_execnz .LBB8_7 +; CGP-NEXT: ; %bb.5: ; %Flow +; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; CGP-NEXT: s_cbranch_execnz .LBB8_8 +; CGP-NEXT: .LBB8_6: +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: s_setpc_b64 s[30:31] +; CGP-NEXT: .LBB8_7: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v9 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, 
v10 ; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v9 @@ -1801,11 +1815,9 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 -; CGP-NEXT: .LBB8_6: ; %Flow -; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB8_8 -; CGP-NEXT: ; %bb.7: +; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; CGP-NEXT: s_cbranch_execz .LBB8_6 +; CGP-NEXT: .LBB8_8: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v9 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -1826,7 +1838,6 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 -; CGP-NEXT: .LBB8_8: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: s_setpc_b64 s[30:31] %shl.y = shl <2 x i64> , %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index e6b5660d000f5..f2ad41481eca4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -16,8 +16,14 @@ define i64 @v_urem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB0_2 -; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_cbranch_execnz .LBB0_3 +; CHECK-NEXT: ; %bb.1: ; %Flow +; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; CHECK-NEXT: s_cbranch_execnz .LBB0_4 +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB0_3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v3 ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 @@ -144,11 +150,9 @@ define i64 @v_urem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 -; CHECK-NEXT: .LBB0_2: ; %Flow -; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] -; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB0_4 -; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v2 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -167,7 +171,6 @@ define i64 @v_urem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = urem i64 %num, %den @@ -748,8 +751,7 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 ; CGP-NEXT: .LBB2_2: ; %Flow1 -; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] +; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] ; CGP-NEXT: s_cbranch_execz .LBB2_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v4 @@ -778,8 +780,14 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB2_6 -; CGP-NEXT: 
; %bb.5: +; CGP-NEXT: s_cbranch_execnz .LBB2_7 +; CGP-NEXT: ; %bb.5: ; %Flow +; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; CGP-NEXT: s_cbranch_execnz .LBB2_8 +; CGP-NEXT: .LBB2_6: +; CGP-NEXT: s_or_b64 exec, exec, s[4:5] +; CGP-NEXT: s_setpc_b64 s[30:31] +; CGP-NEXT: .LBB2_7: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v7 ; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v6 @@ -906,11 +914,9 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 -; CGP-NEXT: .LBB2_6: ; %Flow -; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB2_8 -; CGP-NEXT: ; %bb.7: +; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; CGP-NEXT: s_cbranch_execz .LBB2_6 +; CGP-NEXT: .LBB2_8: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -929,7 +935,6 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 -; CGP-NEXT: .LBB2_8: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] %result = urem <2 x i64> %num, %den @@ -1612,8 +1617,14 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB7_2 -; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_cbranch_execnz .LBB7_3 +; CHECK-NEXT: ; %bb.1: ; %Flow +; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; CHECK-NEXT: s_cbranch_execnz .LBB7_4 +; CHECK-NEXT: .LBB7_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB7_3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5 ; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v6 ; CHECK-NEXT: v_sub_i32_e32 v2, vcc, 0, v5 @@ -1740,11 +1751,9 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 -; CHECK-NEXT: .LBB7_2: ; %Flow -; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] -; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB7_4 -; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; CHECK-NEXT: s_cbranch_execz .LBB7_2 +; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -1763,7 +1772,6 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_setpc_b64 s[30:31] %shl.y = shl i64 4096, %y @@ -2203,8 +2211,14 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB8_6 -; CGP-NEXT: ; %bb.5: +; CGP-NEXT: s_cbranch_execnz .LBB8_7 +; CGP-NEXT: ; %bb.5: ; %Flow +; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; CGP-NEXT: s_cbranch_execnz .LBB8_8 +; CGP-NEXT: .LBB8_6: +; CGP-NEXT: s_or_b64 exec, exec, s[4:5] +; CGP-NEXT: s_setpc_b64 s[30:31] +; CGP-NEXT: .LBB8_7: ; CGP-NEXT: 
v_cvt_f32_u32_e32 v2, v9 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v10 ; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v9 @@ -2331,11 +2345,9 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc ; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 -; CGP-NEXT: .LBB8_6: ; %Flow -; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB8_8 -; CGP-NEXT: ; %bb.7: +; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; CGP-NEXT: s_cbranch_execz .LBB8_6 +; CGP-NEXT: .LBB8_8: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v9 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -2354,7 +2366,6 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v9 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 -; CGP-NEXT: .LBB8_8: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] %shl.y = shl <2 x i64> , %y diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index a791676e618b1..a4e2af802b73a 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -431,9 +431,9 @@ endif: ; GCN-NEXT: s_and_saveexec_b64 [[TEMP_MASK:s\[[0-9]+:[0-9]+\]]], vcc ; GCN-NEXT: s_xor_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[TEMP_MASK]] -; GCN: BB{{[0-9]+_[0-9]+}}: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 [[TEMP_MASK1:s\[[0-9]+:[0-9]+\]]], [[MASK]] -; GCN-NEXT: s_xor_b64 exec, exec, [[TEMP_MASK1]] +; GCN: .LBB{{[0-9]+_[0-9]+}}: ; %Flow1 +; GCN-NEXT: s_andn2_saveexec_b64 [[MASK]], [[MASK]] +; GCN-NEXT: s_cbranch_execnz ; GCN: .L[[LOOP_BODY:BB[0-9]+_[0-9]+]]: ; %loop{{$}} ; GCN: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll index 9077a28571375..2338add43d06c 100644 --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -123,8 +123,7 @@ define i64 @sdiv64(i64 %a, i64 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: .LBB0_2: ; %Flow -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] -; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] ; GFX9-NEXT: s_cbranch_execz .LBB0_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 @@ -260,8 +259,7 @@ define i64 @udiv64(i64 %a, i64 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: .LBB1_2: ; %Flow -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] -; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] ; GFX9-NEXT: s_cbranch_execz .LBB1_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 @@ -410,8 +408,7 @@ define i64 @srem64(i64 %a, i64 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: .LBB2_2: ; %Flow -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[8:9] -; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[8:9] ; GFX9-NEXT: s_cbranch_execz .LBB2_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 @@ -544,8 +541,7 @@ define i64 @urem64(i64 %a, i64 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: .LBB3_2: ; %Flow -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[8:9] -; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] 
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[8:9] ; GFX9-NEXT: s_cbranch_execz .LBB3_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 @@ -831,8 +827,7 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: .LBB8_2: ; %Flow -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[10:11] -; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] ; GFX9-NEXT: s_cbranch_execz .LBB8_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 @@ -984,8 +979,7 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: .LBB9_2: ; %Flow -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[8:9] -; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[8:9] ; GFX9-NEXT: s_cbranch_execz .LBB9_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index 23879e5cb7685..f81c46ee2439b 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -149,8 +149,7 @@ bb.outer.end: ; preds = %bb.inner.then, %bb ; GCN-NEXT: ; %bb.{{[0-9]+}}: ; GCN: store_dword ; GCN: {{^}}[[THEN_INNER]]: -; GCN-NEXT: s_or_saveexec_b64 [[SAVEEXEC_INNER3:s\[[0-9:]+\]]], [[SAVEEXEC_INNER2]] -; GCN-NEXT: s_xor_b64 exec, exec, [[SAVEEXEC_INNER3]] +; GCN-NEXT: s_andn2_saveexec_b64 [[SAVEEXEC_INNER2]], [[SAVEEXEC_INNER2]] ; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER]] ; GCN: store_dword ; GCN-NEXT: {{^}}[[ENDIF_OUTER]]: @@ -241,8 +240,7 @@ bb.outer.end: ; preds = %bb, %bb.then, %b ; GCN-NEXT: {{^}}[[THEN_OUTER_FLOW]]: ; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER_IF_OUTER_ELSE]] ; GCN: {{^}}[[THEN_OUTER]]: -; GCN-NEXT: s_or_saveexec_b64 [[SAVEEXEC_OUTER3:s\[[0-9:]+\]]], [[SAVEEXEC_OUTER2]] -; GCN-NEXT: s_xor_b64 exec, exec, [[SAVEEXEC_OUTER3]] +; GCN-NEXT: s_andn2_saveexec_b64 [[SAVEEXEC_OUTER2]], [[SAVEEXEC_OUTER2]] ; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] ; GCN-NEXT: ; %bb.{{[0-9]+}}: ; GCN: store_dword @@ -252,7 +250,7 @@ bb.outer.end: ; preds = %bb, %bb.then, %b ; GCN: store_dword ; GCN-NEXT: [[FLOW1]]: ; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_ELSE]] -; GCN: s_or_b64 exec, exec, [[SAVEEXEC_OUTER3]] +; GCN: s_or_b64 exec, exec, [[SAVEEXEC_OUTER2]] ; GCN: ds_write_b32 ; GCN: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/else.ll b/llvm/test/CodeGen/AMDGPU/else.ll index 479d20586a562..85a67248eb88f 100644 --- a/llvm/test/CodeGen/AMDGPU/else.ll +++ b/llvm/test/CodeGen/AMDGPU/else.ll @@ -3,8 +3,7 @@ ; CHECK-LABEL: {{^}}else_no_execfix: ; CHECK: ; %Flow -; CHECK-NEXT: s_or_saveexec_b64 [[DST:s\[[0-9]+:[0-9]+\]]], -; CHECK-NEXT: s_xor_b64 exec, exec, [[DST]] +; CHECK-NEXT: s_andn2_saveexec_b64 [[DST:s\[[0-9]+:[0-9]+\]]], [[DST]] define amdgpu_ps float @else_no_execfix(i32 %z, float %v) #0 { main_body: %cc = icmp sgt i32 %z, 5 diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll index 5be5607058d50..930a1b7b931b2 100644 --- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll @@ -28,8 +28,7 @@ define amdgpu_ps void @return_void(float %0) #0 { ; CHECK-NEXT: s_mov_b64 vcc, 0 ; CHECK-NEXT: s_branch .LBB0_1 ; CHECK-NEXT: .LBB0_3: ; %Flow1 -; CHECK-NEXT: s_or_saveexec_b64 s[0:1], s[2:3] -; CHECK-NEXT: s_xor_b64 exec, exec, s[0:1] +; 
CHECK-NEXT: s_andn2_saveexec_b64 s[0:1], s[2:3] ; CHECK-NEXT: s_cbranch_execz .LBB0_5 ; CHECK-NEXT: ; %bb.4: ; %end ; CHECK-NEXT: v_mov_b32_e32 v0, 1.0 @@ -73,8 +72,7 @@ define amdgpu_ps void @return_void_compr(float %0) #0 { ; CHECK-NEXT: s_mov_b64 vcc, 0 ; CHECK-NEXT: s_branch .LBB1_1 ; CHECK-NEXT: .LBB1_3: ; %Flow1 -; CHECK-NEXT: s_or_saveexec_b64 s[0:1], s[2:3] -; CHECK-NEXT: s_xor_b64 exec, exec, s[0:1] +; CHECK-NEXT: s_andn2_saveexec_b64 s[0:1], s[2:3] ; CHECK-NEXT: s_cbranch_execz .LBB1_5 ; CHECK-NEXT: ; %bb.4: ; %end ; CHECK-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll index 32604da2fb535..1244ab28c14f3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll @@ -182,8 +182,7 @@ define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inr ; CHECK-NEXT: ; %bb.1: ; %ELSE ; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen ; CHECK-NEXT: .LBB6_2: ; %Flow -; CHECK-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] -; CHECK-NEXT: s_xor_b64 exec, exec, s[0:1] +; CHECK-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; CHECK-NEXT: s_cbranch_execz .LBB6_4 ; CHECK-NEXT: ; %bb.3: ; %IF ; CHECK-NEXT: v_mov_b32_e32 v0, s12 @@ -238,8 +237,7 @@ define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inr ; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen ; CHECK-NEXT: s_mov_b64 exec, s[18:19] ; CHECK-NEXT: .LBB7_2: ; %Flow -; CHECK-NEXT: s_or_saveexec_b64 s[0:1], s[16:17] -; CHECK-NEXT: s_xor_b64 exec, exec, s[0:1] +; CHECK-NEXT: s_andn2_saveexec_b64 s[0:1], s[16:17] ; CHECK-NEXT: s_cbranch_execz .LBB7_4 ; CHECK-NEXT: ; %bb.3: ; %IF ; CHECK-NEXT: v_mov_b32_e32 v0, s12 diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll index b7bed2b3b1cc9..adc6db21f818d 100644 --- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -73,8 +73,7 @@ ; GCN-NEXT: s_and_b64 [[EXIT1]], vcc, exec ; GCN: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 -; GCN-NEXT: s_xor_b64 +; GCN-NEXT: s_andn2_saveexec_b64 ; GCN: ; %LeafBlock ; GCN-DAG: v_cmp_eq_u32_e32 vcc, 1, diff --git a/llvm/test/CodeGen/AMDGPU/s_or_saveexec_xor_combine.mir b/llvm/test/CodeGen/AMDGPU/s_or_saveexec_xor_combine.mir new file mode 100644 index 0000000000000..c5d667158afdf --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/s_or_saveexec_xor_combine.mir @@ -0,0 +1,127 @@ +# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,WAVE32 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,WAVE64 %s + +--- + +# After the Optimize exec masking (post-RA) pass, codegen can end up with the following sequence: +# s_or_saveexec_b32 s0, s0 +# s_xor_b32 exec_lo, exec_lo, s0 +# +# This can be combined into one instruction: +# s_andn2_saveexec_b32 s0, s0 + +# Ensure the transformation gets applied in the b32 case. 
+# GCN-LABEL: name: s_or_saveexec_xor_combine_b32
+# WAVE32: S_ANDN2_SAVEEXEC_B32
+name: s_or_saveexec_xor_combine_b32
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0
+    renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr0, implicit-def $exec, implicit-def $scc, implicit $exec
+    $exec_lo = S_XOR_B32 $exec_lo, renamable $sgpr0, implicit-def $scc
+...
+
+---
+
+# Ensure the transformation gets applied in the b64 case.
+# GCN-LABEL: name: s_or_saveexec_xor_combine_b64
+# WAVE64: S_ANDN2_SAVEEXEC_B64
+name: s_or_saveexec_xor_combine_b64
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    renamable $sgpr0_sgpr1 = S_OR_SAVEEXEC_B64 killed renamable $sgpr0_sgpr1, implicit-def $exec, implicit-def $scc, implicit $exec
+    $exec = S_XOR_B64 $exec, renamable $sgpr0_sgpr1, implicit-def $scc
+...
+
+---
+
+# Ensure the transformation does get applied even if the operands are swapped.
+# GCN-LABEL: name: s_or_saveexec_xor_combine_b32_swap
+# WAVE32: S_ANDN2_SAVEEXEC_B32
+name: s_or_saveexec_xor_combine_b32_swap
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0
+    renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr0, implicit-def $exec, implicit-def $scc, implicit $exec
+    $exec_lo = S_XOR_B32 renamable $sgpr0, $exec_lo, implicit-def $scc
+...
+
+---
+
+# Ensure the transformation does get applied if the source and dest operands of s_or_saveexec are not equal.
+# GCN-LABEL: name: s_or_saveexec_xor_combine_b32_inequal_operands
+# WAVE32: S_ANDN2_SAVEEXEC
+name: s_or_saveexec_xor_combine_b32_inequal_operands
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+    renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr1, implicit-def $exec, implicit-def $scc, implicit $exec
+    $exec_lo = S_XOR_B32 $exec_lo, renamable $sgpr0, implicit-def $scc
+...
+
+---
+
+# Ensure the transformation does not get applied if s_xor does not use the dest as an input operand.
+# GCN-LABEL: name: s_or_saveexec_xor_combine_b32_wrong_input
+# WAVE32: S_OR_SAVEEXEC
+# WAVE32: S_XOR_B32
+name: s_or_saveexec_xor_combine_b32_wrong_input
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+    renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr0, implicit-def $exec, implicit-def $scc, implicit $exec
+    $exec_lo = S_XOR_B32 $exec_lo, renamable $sgpr1, implicit-def $scc
+...
+
+---
+
+
+# Ensure the transformation does not get applied if the instructions don't appear sequentially.
+# GCN-LABEL: name: s_or_saveexec_xor_combine_b32_non_sequence
+# WAVE32: S_OR_SAVEEXEC
+# WAVE32: S_MOV_B32
+# WAVE32: S_XOR_B32
+name: s_or_saveexec_xor_combine_b32_non_sequence
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+    renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr0, implicit-def $exec, implicit-def $scc, implicit $exec
+    renamable $sgpr1 = S_MOV_B32 renamable $sgpr0
+    $exec_lo = S_XOR_B32 $exec_lo, renamable $sgpr1, implicit-def $scc
+...
+
+---
+
+# Don't apply the transformation if the basic block only has a single instruction.
+
+# GCN-LABEL: name: s_or_saveexec_xor_combine_b32_last_inst
+# WAVE32: S_OR_SAVEEXEC
+name: s_or_saveexec_xor_combine_b32_last_inst
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0
+    renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr0, implicit-def $exec, implicit-def $scc, implicit $exec
+...
+
+---
+
+# Don't apply the transformation if the basic block ends with an S_OR_SAVEEXEC_B32 instruction.
+ +# GCN-LABEL: name: s_or_saveexec_xor_combine_b32_or_saveexec_terminator +# WAVE32: S_MOV_B32 +# WAVE32: S_OR_SAVEEXEC +name: s_or_saveexec_xor_combine_b32_or_saveexec_terminator +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + renamable $sgpr1 = S_MOV_B32 renamable $sgpr0 + renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr0, implicit-def $exec, implicit-def $scc, implicit $exec \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll index 19bdd90ee6d9a..a0deb4572e0dd 100644 --- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll +++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll @@ -73,8 +73,13 @@ define amdgpu_cs void @if_else_vgpr_opt(<4 x i32> inreg %input, <4 x i32> inreg ; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 3, v0 ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: s_xor_b32 s0, exec_lo, s0 -; GCN-NEXT: s_cbranch_execz .LBB1_4 -; GCN-NEXT: ; %bb.3: ; %.else +; GCN-NEXT: s_cbranch_execnz .LBB1_5 +; GCN-NEXT: ; %bb.3: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b32 s0, s0 +; GCN-NEXT: s_cbranch_execnz .LBB1_6 +; GCN-NEXT: .LBB1_4: ; %.end +; GCN-NEXT: s_endpgm +; GCN-NEXT: .LBB1_5: ; %.else ; GCN-NEXT: s_or_saveexec_b32 s1, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 exec_lo, s1 @@ -89,15 +94,11 @@ define amdgpu_cs void @if_else_vgpr_opt(<4 x i32> inreg %input, <4 x i32> inreg ; GCN-NEXT: v_mov_b32_e32 v3, -1 ; GCN-NEXT: buffer_store_dword v3, v0, s[4:7], 0 offen ; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: .LBB1_4: ; %Flow -; GCN-NEXT: s_or_saveexec_b32 s0, s0 -; GCN-NEXT: s_waitcnt_depctr 0xffe3 -; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GCN-NEXT: s_cbranch_execz .LBB1_6 -; GCN-NEXT: ; %bb.5: ; %.then +; GCN-NEXT: s_andn2_saveexec_b32 s0, s0 +; GCN-NEXT: s_cbranch_execz .LBB1_4 +; GCN-NEXT: .LBB1_6: ; %.then ; GCN-NEXT: v_mov_b32_e32 v0, -1 ; GCN-NEXT: buffer_store_dword v0, v3, s[4:7], 0 offen -; GCN-NEXT: .LBB1_6: ; %.end ; GCN-NEXT: s_endpgm .entry: %LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll index 15384e670cbb5..bf9041c15969b 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -174,8 +174,7 @@ define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: .LBB3_2: ; %Flow ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_or_saveexec_b64 s[0:1], s[10:11] -; SI-NEXT: s_xor_b64 exec, exec, s[0:1] +; SI-NEXT: s_andn2_saveexec_b64 s[0:1], s[10:11] ; SI-NEXT: s_cbranch_execz .LBB3_4 ; SI-NEXT: ; %bb.3: ; %if ; SI-NEXT: s_mov_b32 s15, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll index 21d45a9c3bf58..ee4978b97d5ad 100644 --- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll +++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll @@ -81,58 +81,60 @@ else: ; preds = %else.if.cond } define amdgpu_ps { <4 x float> } @test_return_to_epilog_with_optimized_kill(float %val) #0 { - ; GCN-LABEL: name: test_return_to_epilog_with_optimized_kill - ; GCN: bb.0.entry: - ; GCN: successors: %bb.1(0x40000000), %bb.4(0x40000000) - ; GCN: liveins: $vgpr0 - ; GCN: renamable $vgpr1 = nofpexcept V_RCP_F32_e32 $vgpr0, implicit $mode, implicit $exec - ; GCN: 
$sgpr0_sgpr1 = S_MOV_B64 $exec - ; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr1, implicit-def $vcc, implicit $mode, implicit $exec - ; GCN: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GCN: renamable $sgpr2_sgpr3 = S_XOR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def dead $scc - ; GCN: S_CBRANCH_EXECZ %bb.4, implicit $exec - ; GCN: bb.1.flow.preheader: - ; GCN: successors: %bb.2(0x80000000) - ; GCN: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3 - ; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $mode, implicit $exec - ; GCN: renamable $sgpr4_sgpr5 = S_MOV_B64 0 - ; GCN: bb.2.flow: - ; GCN: successors: %bb.3(0x04000000), %bb.2(0x7c000000) - ; GCN: liveins: $vcc, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5 - ; GCN: renamable $sgpr6_sgpr7 = S_AND_B64 $exec, renamable $vcc, implicit-def $scc - ; GCN: renamable $sgpr4_sgpr5 = S_OR_B64 killed renamable $sgpr6_sgpr7, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GCN: $exec = S_ANDN2_B64 $exec, renamable $sgpr4_sgpr5, implicit-def $scc - ; GCN: S_CBRANCH_EXECNZ %bb.2, implicit $exec - ; GCN: bb.3.Flow: - ; GCN: successors: %bb.4(0x80000000) - ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5 - ; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GCN: bb.4.Flow1: - ; GCN: successors: %bb.5(0x40000000), %bb.7(0x40000000) - ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 - ; GCN: renamable $sgpr2_sgpr3 = S_OR_SAVEEXEC_B64 killed renamable $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec - ; GCN: $exec = S_XOR_B64 $exec, renamable $sgpr2_sgpr3, implicit-def $scc - ; GCN: S_CBRANCH_EXECZ %bb.7, implicit $exec - ; GCN: bb.5.kill0: - ; GCN: successors: %bb.6(0x40000000), %bb.8(0x40000000) - ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 - ; GCN: dead renamable $sgpr0_sgpr1 = S_ANDN2_B64 killed renamable $sgpr0_sgpr1, $exec, implicit-def $scc - ; GCN: S_CBRANCH_SCC0 %bb.8, implicit $scc - ; GCN: bb.6.kill0: - ; GCN: successors: %bb.7(0x80000000) - ; GCN: liveins: $sgpr2_sgpr3, $scc - ; GCN: $exec = S_MOV_B64 0 - ; GCN: bb.7.end: - ; GCN: successors: %bb.9(0x80000000) - ; GCN: liveins: $sgpr2_sgpr3 - ; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc - ; GCN: S_BRANCH %bb.9 - ; GCN: bb.8: - ; GCN: $exec = S_MOV_B64 0 - ; GCN: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec - ; GCN: S_ENDPGM 0 - ; GCN: bb.9: -entry: +; GCN-LABEL: name: test_return_to_epilog_with_optimized_kill +; GCN: bb.0 (%ir-block.0): +; GCN: successors: %bb.3(0x40000000), %bb.1(0x40000000) +; GCN: liveins: $vgpr0 +; GCN: renamable $vgpr1 = nofpexcept V_RCP_F32_e32 $vgpr0, implicit $mode, implicit $exec +; GCN: $sgpr0_sgpr1 = S_MOV_B64 $exec +; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr1, implicit-def $vcc, implicit $mode, implicit $exec +; GCN: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec +; GCN: renamable $sgpr2_sgpr3 = S_XOR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def dead $scc +; GCN: S_CBRANCH_EXECNZ %bb.3, implicit $exec +; GCN: bb.1.Flow1: +; GCN: successors: %bb.6(0x40000000), %bb.2(0x40000000) +; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 +; GCN: $sgpr2_sgpr3 = S_ANDN2_SAVEEXEC_B64 killed $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec +; GCN: S_CBRANCH_EXECNZ %bb.6, implicit $exec +; GCN: bb.2.end: +; GCN: successors: %bb.9(0x80000000) +; GCN: liveins: 
$sgpr2_sgpr3 +; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc +; GCN: S_BRANCH %bb.9 +; GCN: bb.3.flow.preheader: +; GCN: successors: %bb.4(0x80000000) +; GCN: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3 +; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $mode, implicit $exec +; GCN: renamable $sgpr4_sgpr5 = S_MOV_B64 0 +; GCN: bb.4.flow: +; GCN: successors: %bb.5(0x04000000), %bb.4(0x7c000000) +; GCN: liveins: $vcc, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5 +; GCN: renamable $sgpr6_sgpr7 = S_AND_B64 $exec, renamable $vcc, implicit-def $scc +; GCN: renamable $sgpr4_sgpr5 = S_OR_B64 killed renamable $sgpr6_sgpr7, killed renamable $sgpr4_sgpr5, implicit-def $scc +; GCN: $exec = S_ANDN2_B64 $exec, renamable $sgpr4_sgpr5, implicit-def $scc +; GCN: S_CBRANCH_EXECNZ %bb.4, implicit $exec +; GCN: bb.5.Flow: +; GCN: successors: %bb.6(0x40000000), %bb.2(0x40000000) +; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5 +; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc +; GCN: $sgpr2_sgpr3 = S_ANDN2_SAVEEXEC_B64 killed $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec +; GCN: S_CBRANCH_EXECZ %bb.2, implicit $exec +; GCN: bb.6.kill0: +; GCN: successors: %bb.7(0x40000000), %bb.8(0x40000000) +; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 +; GCN: dead renamable $sgpr0_sgpr1 = S_ANDN2_B64 killed renamable $sgpr0_sgpr1, $exec, implicit-def $scc +; GCN: S_CBRANCH_SCC0 %bb.8, implicit $scc +; GCN: bb.7.kill0: +; GCN: successors: %bb.9(0x80000000) +; GCN: liveins: $sgpr2_sgpr3, $scc +; GCN: $exec = S_MOV_B64 0 +; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc +; GCN: S_BRANCH %bb.9 +; GCN: bb.8: +; GCN: $exec = S_MOV_B64 0 +; GCN: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec +; GCN: S_ENDPGM 0 +; GCN: bb.9: %.i0 = fdiv reassoc nnan nsz arcp contract afn float 1.000000e+00, %val %cmp0 = fcmp olt float %.i0, 0.000000e+00 br i1 %cmp0, label %kill0, label %flow diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll index 7447e12bb74e1..12a358c299c05 100644 --- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll @@ -22,8 +22,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; v_mov should be after exec modification ; SI: [[FLOW_BB]]: -; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]] -; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]] +; SI-NEXT: s_andn2_saveexec_b64 [[SAVE2]], [[SAVE2]] ; define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 { entry: @@ -121,8 +120,7 @@ exit: ; SI: s_cbranch_execnz [[EXIT:.LBB[0-9]+_[0-9]+]] ; SI-NEXT: {{^.LBB[0-9]+_[0-9]+}}: ; %Flow -; SI-NEXT: s_or_saveexec_b64 -; SI-NEXT: s_xor_b64 exec, exec +; SI-NEXT: s_andn2_saveexec_b64 [[BR_SREG]], [[BR_SREG]] ; SI-NEXT: s_cbranch_execz [[UNIFIED_RETURN:.LBB[0-9]+_[0-9]+]] ; SI-NEXT: ; %bb.{{[0-9]+}}: ; %then @@ -163,7 +161,6 @@ exit: ; SI: s_cbranch_scc1 [[LABEL_LOOP]] ; SI: [[LABEL_EXIT]]: ; SI: s_endpgm - define amdgpu_kernel void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -223,7 +220,6 @@ exit: ; SI: [[LABEL_EXIT]]: ; SI-NOT: [[COND_STATE]] ; SI: s_endpgm - define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias 
nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll index f8aae95f62b12..9dcd3a66a16db 100644 --- a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll +++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll @@ -134,7 +134,7 @@ l2: ; any of the v_cmp source operands. ; GCN-LABEL: check_saveexec_overwrites_vcmp_source: -; GCN: ; %bb.1: ; %then +; GCN: .LBB7_3: ; %then ; GFX1010: v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}} ; GFX1010-NEXT: v_mov_b32_e32 {{.*}}, s[[A]] ; GFX1010-NEXT: s_and_saveexec_b32 s[[A]], vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll index b9e03a6689094..0eb614c11142f 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll @@ -9,17 +9,23 @@ define amdgpu_ps float @else1(i32 %z, float %v) #0 { ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo ; SI-NEXT: s_xor_b32 s0, exec_lo, s0 -; SI-NEXT: ; %bb.1: ; %else +; SI-NEXT: s_cbranch_execnz .LBB0_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b32 s0, s0 +; SI-NEXT: s_cbranch_execnz .LBB0_4 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; SI-NEXT: s_branch .LBB0_5 +; SI-NEXT: .LBB0_3: ; %else ; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; %bb.2: ; %Flow -; SI-NEXT: s_or_saveexec_b32 s0, s0 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; SI-NEXT: ; %bb.3: ; %if +; SI-NEXT: s_andn2_saveexec_b32 s0, s0 +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: .LBB0_4: ; %if ; SI-NEXT: v_add_f32_e32 v0, v1, v1 -; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; SI-NEXT: ; return to shader part epilog +; SI-NEXT: s_branch .LBB0_5 +; SI-NEXT: .LBB0_5: main_body: %cc = icmp sgt i32 %z, 5 br i1 %cc, label %if, label %else @@ -49,8 +55,7 @@ define amdgpu_ps float @else2(i32 %z, float %v) #0 { ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 ; SI-NEXT: ; %bb.2: ; %Flow -; SI-NEXT: s_or_saveexec_b32 s0, s0 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; SI-NEXT: s_andn2_saveexec_b32 s0, s0 ; SI-NEXT: ; %bb.3: ; %if ; SI-NEXT: v_add_f32_e32 v1, v1, v1 ; SI-NEXT: v_mov_b32_e32 v0, v1 @@ -104,8 +109,7 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; %bb.4: ; %Flow ; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; SI-NEXT: s_or_saveexec_b32 s2, s2 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s2 +; SI-NEXT: s_andn2_saveexec_b32 s2, s2 ; SI-NEXT: s_cbranch_execz .LBB2_1 ; SI-NEXT: ; %bb.5: ; %if ; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1 @@ -191,8 +195,7 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* % ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: .LBB3_4: ; %Flow -; SI-NEXT: s_or_saveexec_b32 s6, s6 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s6 +; SI-NEXT: s_andn2_saveexec_b32 s6, s6 ; SI-NEXT: s_cbranch_execz .LBB3_8 ; SI-NEXT: ; %bb.5: ; %if ; SI-NEXT: s_mov_b32 s7, exec_lo @@ -267,8 +270,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float( ; SI-NEXT: s_mov_b32 exec_lo, s7 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: .LBB4_4: ; %Flow -; SI-NEXT: s_or_saveexec_b32 s6, s6 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s6 +; SI-NEXT: 
s_andn2_saveexec_b32 s6, s6 ; SI-NEXT: s_cbranch_execz .LBB4_8 ; SI-NEXT: ; %bb.5: ; %if ; SI-NEXT: s_mov_b32 s7, exec_lo diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 7cfb43ba802ac..d1e867e526bc3 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -1232,8 +1232,7 @@ define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-W64-NEXT: ; implicit-def: $vgpr0 ; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-W64-NEXT: .LBB23_2: ; %Flow -; GFX9-W64-NEXT: s_or_saveexec_b64 s[14:15], s[14:15] -; GFX9-W64-NEXT: s_xor_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15] ; GFX9-W64-NEXT: s_cbranch_execz .LBB23_4 ; GFX9-W64-NEXT: ; %bb.3: ; %IF ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 @@ -1260,8 +1259,7 @@ define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-W32-NEXT: ; implicit-def: $vgpr0 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14 ; GFX10-W32-NEXT: .LBB23_2: ; %Flow -; GFX10-W32-NEXT: s_or_saveexec_b32 s13, s13 -; GFX10-W32-NEXT: s_xor_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13 ; GFX10-W32-NEXT: s_cbranch_execz .LBB23_4 ; GFX10-W32-NEXT: ; %bb.3: ; %IF ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D @@ -1396,8 +1394,7 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3 ; GFX9-W64-NEXT: v_lshlrev_b32_e32 v0, 2, v5 ; GFX9-W64-NEXT: ; implicit-def: $vgpr5 ; GFX9-W64-NEXT: ; %bb.2: ; %Flow -; GFX9-W64-NEXT: s_or_saveexec_b64 s[14:15], s[14:15] -; GFX9-W64-NEXT: s_xor_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15] ; GFX9-W64-NEXT: ; %bb.3: ; %IF ; GFX9-W64-NEXT: v_mul_lo_u32 v0, v5, 3 ; GFX9-W64-NEXT: ; %bb.4: ; %END @@ -1427,8 +1424,7 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3 ; GFX10-W32-NEXT: v_lshlrev_b32_e32 v0, 2, v5 ; GFX10-W32-NEXT: ; implicit-def: $vgpr5 ; GFX10-W32-NEXT: ; %bb.2: ; %Flow -; GFX10-W32-NEXT: s_or_saveexec_b32 s13, s13 -; GFX10-W32-NEXT: s_xor_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13 ; GFX10-W32-NEXT: ; %bb.3: ; %IF ; GFX10-W32-NEXT: v_mul_lo_u32 v0, v5, 3 ; GFX10-W32-NEXT: ; %bb.4: ; %END @@ -1486,18 +1482,25 @@ define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-W64-NEXT: ; implicit-def: $vgpr0 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-W64-NEXT: ; %bb.1: ; %ELSE +; GFX9-W64-NEXT: s_cbranch_execnz .LBB26_3 +; GFX9-W64-NEXT: ; %bb.1: ; %Flow +; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9-W64-NEXT: s_cbranch_execnz .LBB26_4 +; GFX9-W64-NEXT: .LBB26_2: ; %END +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: s_branch .LBB26_5 +; GFX9-W64-NEXT: .LBB26_3: ; %ELSE ; GFX9-W64-NEXT: v_mul_f32_e32 v0, 4.0, v1 ; GFX9-W64-NEXT: ; implicit-def: $vgpr1 -; GFX9-W64-NEXT: ; %bb.2: ; %Flow -; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] -; GFX9-W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9-W64-NEXT: ; %bb.3: ; %IF +; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9-W64-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-W64-NEXT: .LBB26_4: ; %IF ; GFX9-W64-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 -; GFX9-W64-NEXT: ; %bb.4: ; %END ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: ; return to 
shader part epilog +; GFX9-W64-NEXT: s_branch .LBB26_5 +; GFX9-W64-NEXT: .LBB26_5: ; ; GFX10-W32-LABEL: test_control_flow_3: ; GFX10-W32: ; %bb.0: ; %main_body @@ -1513,18 +1516,25 @@ define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-W32-NEXT: ; implicit-def: $vgpr0 ; GFX10-W32-NEXT: v_cmpx_nlt_f32_e32 0, v1 ; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX10-W32-NEXT: ; %bb.1: ; %ELSE +; GFX10-W32-NEXT: s_cbranch_execnz .LBB26_3 +; GFX10-W32-NEXT: ; %bb.1: ; %Flow +; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0 +; GFX10-W32-NEXT: s_cbranch_execnz .LBB26_4 +; GFX10-W32-NEXT: .LBB26_2: ; %END +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-W32-NEXT: s_branch .LBB26_5 +; GFX10-W32-NEXT: .LBB26_3: ; %ELSE ; GFX10-W32-NEXT: v_mul_f32_e32 v0, 4.0, v1 ; GFX10-W32-NEXT: ; implicit-def: $vgpr1 -; GFX10-W32-NEXT: ; %bb.2: ; %Flow -; GFX10-W32-NEXT: s_or_saveexec_b32 s0, s0 -; GFX10-W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX10-W32-NEXT: ; %bb.3: ; %IF +; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0 +; GFX10-W32-NEXT: s_cbranch_execz .LBB26_2 +; GFX10-W32-NEXT: .LBB26_4: ; %IF ; GFX10-W32-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 -; GFX10-W32-NEXT: ; %bb.4: ; %END ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-W32-NEXT: ; return to shader part epilog +; GFX10-W32-NEXT: s_branch .LBB26_5 +; GFX10-W32-NEXT: .LBB26_5: main_body: %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0