diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 337218565f5fd0..2617558b257507 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -741,16 +741,19 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( // To insert the loop we need to split the block. Move everything before this // point to a new block, and insert a new empty block before this instruction. MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); + MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock(); MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock(); MachineFunction::iterator MBBI(MBB); ++MBBI; MF->insert(MBBI, LoopBB); + MF->insert(MBBI, BodyBB); MF->insert(MBBI, RestoreExecBB); MF->insert(MBBI, RemainderBB); - LoopBB->addSuccessor(RestoreExecBB); - LoopBB->addSuccessor(LoopBB); + LoopBB->addSuccessor(BodyBB); + BodyBB->addSuccessor(RestoreExecBB); + BodyBB->addSuccessor(LoopBB); // Move the rest of the block into a new block. RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); @@ -762,26 +765,26 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( B.setInsertPt(*LoopBB, LoopBB->end()); B.buildInstr(TargetOpcode::PHI) - .addDef(PhiExec) - .addReg(InitSaveExecReg) - .addMBB(&MBB) - .addReg(NewExec) - .addMBB(LoopBB); + .addDef(PhiExec) + .addReg(InitSaveExecReg) + .addMBB(&MBB) + .addReg(NewExec) + .addMBB(BodyBB); const DebugLoc &DL = B.getDL(); MachineInstr &FirstInst = *Range.begin(); - // Move the instruction into the loop. Note we moved everything after + // Move the instruction into the loop body. Note we moved everything after // Range.end() already into a new block, so Range.end() is no longer valid. - LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end()); + BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end()); // Figure out the iterator range after splicing the instructions. MachineBasicBlock::iterator NewBegin = FirstInst.getIterator(); - auto NewEnd = LoopBB->end(); + auto NewEnd = BodyBB->end(); - MachineBasicBlock::iterator I = Range.begin(); - B.setInsertPt(*LoopBB, I); + MachineBasicBlock::iterator I = LoopBB->end(); + B.setMBB(*LoopBB); Register CondReg; @@ -813,7 +816,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( B.setMBB(MBB); OpReg = B.buildCopy(OpTy, OpReg).getReg(0); MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank); - B.setInstr(*I); + B.setMBB(*LoopBB); } unsigned OpSize = OpTy.getSizeInBits(); @@ -879,7 +882,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) UnmergePieces.push_back(Unmerge.getReg(PieceIdx)); } - B.setInstr(*I); + B.setMBB(*LoopBB); for (Register UnmergePiece : UnmergePieces) { Register CurrentLaneOpReg; @@ -978,7 +981,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( MRI.setSimpleHint(NewExec, CondReg); - B.setInsertPt(*LoopBB, LoopBB->end()); + B.setInsertPt(*BodyBB, BodyBB->end()); // Update EXEC, switch all done bits to 0 and all todo bits to 1. B.buildInstr(XorTermOpc) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 453be72ad88155..6f0a61a43070db 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5464,7 +5464,8 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, static void emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, - const DebugLoc &DL, MachineOperand &Rsrc) { + MachineBasicBlock &BodyBB, const DebugLoc &DL, + MachineOperand &Rsrc) { MachineFunction &MF = *OrigBB.getParent(); const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -5557,14 +5558,14 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, .addReg(CondReg, RegState::Kill); // The original instruction is here; we insert the terminators after it. - I = LoopBB.end(); + I = BodyBB.end(); // Update EXEC, switch all done bits to 0 and all todo bits to 1. - BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec) + BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec) .addReg(Exec) .addReg(SaveExec); - BuildMI(LoopBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB); + BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB); } // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register @@ -5611,31 +5612,35 @@ loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, // To insert the loop we need to split the block. Move everything after this // point to a new block, and insert a new empty block between the two. MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); + MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); MachineFunction::iterator MBBI(MBB); ++MBBI; MF.insert(MBBI, LoopBB); + MF.insert(MBBI, BodyBB); MF.insert(MBBI, RemainderBB); - LoopBB->addSuccessor(LoopBB); - LoopBB->addSuccessor(RemainderBB); + LoopBB->addSuccessor(BodyBB); + BodyBB->addSuccessor(LoopBB); + BodyBB->addSuccessor(RemainderBB); - // Move Begin to MI to the LoopBB, and the remainder of the block to + // Move Begin to MI to the BodyBB, and the remainder of the block to // RemainderBB. RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end()); - LoopBB->splice(LoopBB->begin(), &MBB, Begin, MBB.end()); + BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end()); MBB.addSuccessor(LoopBB); // Update dominators. We know that MBB immediately dominates LoopBB, that - // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately - // dominates all of the successors transferred to it from MBB that MBB used - // to properly dominate. + // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates + // RemainderBB. RemainderBB immediately dominates all of the successors + // transferred to it from MBB that MBB used to properly dominate. if (MDT) { MDT->addNewBlock(LoopBB, &MBB); - MDT->addNewBlock(RemainderBB, LoopBB); + MDT->addNewBlock(BodyBB, LoopBB); + MDT->addNewBlock(RemainderBB, BodyBB); for (auto &Succ : RemainderBB->successors()) { if (MDT->properlyDominates(&MBB, Succ)) { MDT->changeImmediateDominator(Succ, RemainderBB); @@ -5643,12 +5648,12 @@ loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, } } - emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc); + emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, Rsrc); // Restore the EXEC mask MachineBasicBlock::iterator First = RemainderBB->begin(); BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); - return LoopBB; + return BodyBB; } // Extract pointer from Rsrc and return a zero-value Rsrc replacement. diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp index e13e33ed545782..07232247c4bdee 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -112,8 +112,10 @@ class SIOptimizeVGPRLiveRange : public MachineFunctionPass { SmallVectorImpl &CandidateRegs) const; void collectWaterfallCandidateRegisters( - MachineBasicBlock *Loop, - SmallSetVector &CandidateRegs) const; + MachineBasicBlock *LoopHeader, MachineBasicBlock *LoopEnd, + SmallSetVector &CandidateRegs, + SmallSetVector &Blocks, + SmallVectorImpl &Instructions) const; void findNonPHIUsesInBlock(Register Reg, MachineBasicBlock *MBB, SmallVectorImpl &Uses) const; @@ -131,7 +133,10 @@ class SIOptimizeVGPRLiveRange : public MachineFunctionPass { MachineBasicBlock *Flow, MachineBasicBlock *Endif, SmallSetVector &ElseBlocks) const; - void optimizeWaterfallLiveRange(Register Reg, MachineBasicBlock *If) const; + void optimizeWaterfallLiveRange( + Register Reg, MachineBasicBlock *LoopHeader, + SmallSetVector &LoopBlocks, + SmallVectorImpl &Instructions) const; SIOptimizeVGPRLiveRange() : MachineFunctionPass(ID) {} @@ -323,12 +328,30 @@ void SIOptimizeVGPRLiveRange::collectCandidateRegisters( /// Collect the registers used in the waterfall loop block that are defined /// before. void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters( - MachineBasicBlock *Loop, - SmallSetVector &CandidateRegs) const { + MachineBasicBlock *LoopHeader, MachineBasicBlock *LoopEnd, + SmallSetVector &CandidateRegs, + SmallSetVector &Blocks, + SmallVectorImpl &Instructions) const { + + // Collect loop instructions, potentially spanning multiple blocks + auto *MBB = LoopHeader; + for (;;) { + Blocks.insert(MBB); + for (auto &MI : *MBB) { + if (MI.isDebugInstr()) + continue; + Instructions.push_back(&MI); + } + if (MBB == LoopEnd) + break; + assert(MBB->pred_size() == 1 || + (MBB == LoopHeader && MBB->pred_size() == 2)); + assert(MBB->succ_size() == 1); + MBB = *MBB->succ_begin(); + } - for (auto &MI : Loop->instrs()) { - if (MI.isDebugInstr()) - continue; + for (auto *I : Instructions) { + auto &MI = *I; for (auto &MO : MI.operands()) { if (!MO.isReg() || !MO.getReg() || MO.isDef()) @@ -340,16 +363,17 @@ void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters( continue; if (MO.readsReg()) { - const MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent(); + MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent(); // Make sure the value is defined before the LOOP block - if (DefMBB != Loop && !CandidateRegs.contains(MOReg)) { + if (!Blocks.contains(DefMBB) && !CandidateRegs.contains(MOReg)) { // If the variable is used after the loop, the register coalescer will // merge the newly created register and remove the phi node again. // Just do nothing in that case. LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(MOReg); bool IsUsed = false; - for (auto *Succ : Loop->successors()) { - if (Succ != Loop && OldVarInfo.isLiveIn(*Succ, MOReg, *MRI)) { + for (auto *Succ : LoopEnd->successors()) { + if (!Blocks.contains(Succ) && + OldVarInfo.isLiveIn(*Succ, MOReg, *MRI)) { IsUsed = true; break; } @@ -513,7 +537,9 @@ void SIOptimizeVGPRLiveRange::optimizeLiveRange( } void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange( - Register Reg, MachineBasicBlock *Loop) const { + Register Reg, MachineBasicBlock *LoopHeader, + SmallSetVector &Blocks, + SmallVectorImpl &Instructions) const { // Insert a new PHI, marking the value from the last loop iteration undef. LLVM_DEBUG(dbgs() << "Optimizing " << printReg(Reg, TRI) << '\n'); const auto *RC = MRI->getRegClass(Reg); @@ -525,15 +551,16 @@ void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange( for (auto &O : make_early_inc_range(MRI->use_operands(Reg))) { auto *UseMI = O.getParent(); auto *UseBlock = UseMI->getParent(); - // Replace uses in Loop block - if (UseBlock == Loop) + // Replace uses in Loop blocks + if (Blocks.contains(UseBlock)) O.setReg(NewReg); } - MachineInstrBuilder PHI = BuildMI(*Loop, Loop->getFirstNonPHI(), DebugLoc(), - TII->get(TargetOpcode::PHI), NewReg); - for (auto *Pred : Loop->predecessors()) { - if (Pred == Loop) + MachineInstrBuilder PHI = + BuildMI(*LoopHeader, LoopHeader->getFirstNonPHI(), DebugLoc(), + TII->get(TargetOpcode::PHI), NewReg); + for (auto *Pred : LoopHeader->predecessors()) { + if (Blocks.contains(Pred)) PHI.addReg(UndefReg, RegState::Undef).addMBB(Pred); else PHI.addReg(Reg).addMBB(Pred); @@ -542,21 +569,36 @@ void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange( LiveVariables::VarInfo &NewVarInfo = LV->getVarInfo(NewReg); LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg); - // collectWaterfallCandidateRegisters only collects registers that are dead - // after the loop. So we know that the old reg is not live throughout the - // whole block anymore. - OldVarInfo.AliveBlocks.reset(Loop->getNumber()); - - // Mark the last use as kill - for (auto &MI : reverse(Loop->instrs())) { - if (MI.readsRegister(NewReg, TRI)) { - MI.addRegisterKilled(NewReg, TRI); - NewVarInfo.Kills.push_back(&MI); + // Find last use and mark as kill + MachineInstr *Kill = nullptr; + for (auto *MI : reverse(Instructions)) { + if (MI->readsRegister(NewReg, TRI)) { + MI->addRegisterKilled(NewReg, TRI); + NewVarInfo.Kills.push_back(MI); + Kill = MI; break; } } - assert(!NewVarInfo.Kills.empty() && - "Failed to find last usage of register in loop"); + assert(Kill && "Failed to find last usage of register in loop"); + + MachineBasicBlock *KillBlock = Kill->getParent(); + bool PostKillBlock = false; + for (auto *Block : Blocks) { + auto BBNum = Block->getNumber(); + + // collectWaterfallCandidateRegisters only collects registers that are dead + // after the loop. So we know that the old reg is no longer live throughout + // the waterfall loop. + OldVarInfo.AliveBlocks.reset(BBNum); + + // The new register is live up to (and including) the block that kills it. + PostKillBlock |= (Block == KillBlock); + if (PostKillBlock) { + NewVarInfo.AliveBlocks.reset(BBNum); + } else if (Block != LoopHeader) { + NewVarInfo.AliveBlocks.set(BBNum); + } + } } char SIOptimizeVGPRLiveRange::ID = 0; @@ -620,15 +662,22 @@ bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) { for (auto Reg : CandidateRegs) optimizeLiveRange(Reg, &MBB, IfTarget, Endif, ElseBlocks); } else if (MI.getOpcode() == AMDGPU::SI_WATERFALL_LOOP) { + auto *LoopHeader = MI.getOperand(0).getMBB(); + auto *LoopEnd = &MBB; + LLVM_DEBUG(dbgs() << "Checking Waterfall loop: " - << printMBBReference(MBB) << '\n'); + << printMBBReference(*LoopHeader) << '\n'); SmallSetVector CandidateRegs; - collectWaterfallCandidateRegisters(&MBB, CandidateRegs); + SmallVector Instructions; + SmallSetVector Blocks; + + collectWaterfallCandidateRegisters(LoopHeader, LoopEnd, CandidateRegs, + Blocks, Instructions); MadeChange |= !CandidateRegs.empty(); // Now we are safe to optimize. for (auto Reg : CandidateRegs) - optimizeWaterfallLiveRange(Reg, &MBB); + optimizeWaterfallLiveRange(Reg, LoopHeader, Blocks, Instructions); } } } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll index 09a25d183c7307..b4bc8da9c00934 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll @@ -68,78 +68,82 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: s_mov_b32 s4, exec_lo ; CHECK-NEXT: v_writelane_b32 v15, s4, 4 ; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s16, v15, 0 -; CHECK-NEXT: v_readlane_b32 s17, v15, 1 -; CHECK-NEXT: v_readlane_b32 s18, v15, 2 -; CHECK-NEXT: v_readlane_b32 s19, v15, 3 -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readfirstlane_b32 s8, v8 -; CHECK-NEXT: v_readfirstlane_b32 s23, v9 +; CHECK-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-NEXT: v_readfirstlane_b32 s19, v7 ; CHECK-NEXT: s_mov_b32 s4, s8 -; CHECK-NEXT: s_mov_b32 s5, s23 -; CHECK-NEXT: v_writelane_b32 v15, s4, 5 -; CHECK-NEXT: v_writelane_b32 v15, s5, 6 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[4:5], v[8:9] -; CHECK-NEXT: v_readfirstlane_b32 s22, v6 -; CHECK-NEXT: v_readfirstlane_b32 s21, v7 -; CHECK-NEXT: s_mov_b32 s6, s22 -; CHECK-NEXT: s_mov_b32 s7, s21 -; CHECK-NEXT: v_writelane_b32 v15, s6, 7 -; CHECK-NEXT: v_writelane_b32 v15, s7, 8 -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[6:7], v[6:7] +; CHECK-NEXT: s_mov_b32 s5, s19 +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[4:5], v[6:7] +; CHECK-NEXT: v_readfirstlane_b32 s18, v4 +; CHECK-NEXT: v_readfirstlane_b32 s17, v5 +; CHECK-NEXT: s_mov_b32 s6, s18 +; CHECK-NEXT: s_mov_b32 s7, s17 +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[6:7], v[4:5] ; CHECK-NEXT: s_and_b32 s5, s4, s5 -; CHECK-NEXT: v_readfirstlane_b32 s20, v4 -; CHECK-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-NEXT: s_mov_b32 s10, s20 +; CHECK-NEXT: v_readfirstlane_b32 s16, v2 +; CHECK-NEXT: v_readfirstlane_b32 s7, v3 +; CHECK-NEXT: s_mov_b32 s10, s16 ; CHECK-NEXT: s_mov_b32 s11, s7 -; CHECK-NEXT: v_writelane_b32 v15, s10, 9 -; CHECK-NEXT: v_writelane_b32 v15, s11, 10 -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[4:5] +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; CHECK-NEXT: s_and_b32 s9, s4, s5 -; CHECK-NEXT: v_readfirstlane_b32 s6, v2 -; CHECK-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-NEXT: v_readfirstlane_b32 s6, v0 +; CHECK-NEXT: v_readfirstlane_b32 s5, v1 ; CHECK-NEXT: s_mov_b32 s10, s6 ; CHECK-NEXT: s_mov_b32 s11, s5 -; CHECK-NEXT: v_writelane_b32 v15, s10, 11 -; CHECK-NEXT: v_writelane_b32 v15, s11, 12 -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[0:1] ; CHECK-NEXT: s_and_b32 s4, s4, s9 ; CHECK-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 -; CHECK-NEXT: s_mov_b32 s9, s23 -; CHECK-NEXT: s_mov_b32 s10, s22 -; CHECK-NEXT: s_mov_b32 s11, s21 -; CHECK-NEXT: s_mov_b32 s12, s20 +; CHECK-NEXT: s_mov_b32 s9, s19 +; CHECK-NEXT: s_mov_b32 s10, s18 +; CHECK-NEXT: s_mov_b32 s11, s17 +; CHECK-NEXT: s_mov_b32 s12, s16 ; CHECK-NEXT: s_mov_b32 s13, s7 ; CHECK-NEXT: s_mov_b32 s14, s6 ; CHECK-NEXT: s_mov_b32 s15, s5 -; CHECK-NEXT: v_writelane_b32 v15, s8, 13 -; CHECK-NEXT: v_writelane_b32 v15, s9, 14 -; CHECK-NEXT: v_writelane_b32 v15, s10, 15 -; CHECK-NEXT: v_writelane_b32 v15, s11, 16 -; CHECK-NEXT: v_writelane_b32 v15, s12, 17 -; CHECK-NEXT: v_writelane_b32 v15, s13, 18 -; CHECK-NEXT: v_writelane_b32 v15, s14, 19 -; CHECK-NEXT: v_writelane_b32 v15, s15, 20 +; CHECK-NEXT: v_writelane_b32 v15, s8, 5 +; CHECK-NEXT: v_writelane_b32 v15, s9, 6 +; CHECK-NEXT: v_writelane_b32 v15, s10, 7 +; CHECK-NEXT: v_writelane_b32 v15, s11, 8 +; CHECK-NEXT: v_writelane_b32 v15, s12, 9 +; CHECK-NEXT: v_writelane_b32 v15, s13, 10 +; CHECK-NEXT: v_writelane_b32 v15, s14, 11 +; CHECK-NEXT: v_writelane_b32 v15, s15, 12 ; CHECK-NEXT: s_and_saveexec_b32 s4, s4 +; CHECK-NEXT: v_writelane_b32 v15, s4, 13 +; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: v_readlane_b32 s4, v15, 13 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; CHECK-NEXT: v_readlane_b32 s8, v15, 5 +; CHECK-NEXT: v_readlane_b32 s9, v15, 6 +; CHECK-NEXT: v_readlane_b32 s10, v15, 7 +; CHECK-NEXT: v_readlane_b32 s11, v15, 8 +; CHECK-NEXT: v_readlane_b32 s12, v15, 9 +; CHECK-NEXT: v_readlane_b32 s13, v15, 10 +; CHECK-NEXT: v_readlane_b32 s14, v15, 11 +; CHECK-NEXT: v_readlane_b32 s15, v15, 12 +; CHECK-NEXT: v_readlane_b32 s16, v15, 0 +; CHECK-NEXT: v_readlane_b32 s17, v15, 1 +; CHECK-NEXT: v_readlane_b32 s18, v15, 2 +; CHECK-NEXT: v_readlane_b32 s19, v15, 3 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB0_1 -; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_readlane_b32 s4, v15, 4 ; CHECK-NEXT: s_mov_b32 exec_lo, s4 -; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: ; %bb.4: ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; CHECK-NEXT: ; implicit-def: $sgpr4 ; CHECK-NEXT: v_mov_b32_e32 v1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll index f86befc1ce17ea..2f9a6e6a17ed94 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll @@ -113,7 +113,7 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_vof ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec @@ -129,16 +129,20 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_vof ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -168,7 +172,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgp ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec @@ -184,16 +188,20 @@ define amdgpu_ps void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgp ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll index 5f5475108a4e5d..c008e0a5bb9977 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll @@ -73,7 +73,7 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_ ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec @@ -89,18 +89,22 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_ ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN [[REG_SEQUENCE4]], [[COPY10]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN]].sub0 ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[COPY13]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -132,7 +136,7 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__ ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec @@ -148,18 +152,22 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__ ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN [[REG_SEQUENCE4]], [[COPY10]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN]].sub0 ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll index 42df538bcc6ca5..afc01b3378bde6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll @@ -155,7 +155,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: - ; GFX908-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec @@ -171,16 +171,20 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: bb.3: + ; GFX908-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: successors: %bb.4(0x80000000) + ; GFX908-NEXT: bb.4: + ; GFX908-NEXT: successors: %bb.5(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: bb.4: + ; GFX908-NEXT: bb.5: ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset ; GFX90A: bb.1 (%ir-block.0): @@ -202,7 +206,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: - ; GFX90A-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec @@ -218,16 +222,20 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; GFX90A-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: bb.3: + ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.3: - ; GFX90A-NEXT: successors: %bb.4(0x80000000) + ; GFX90A-NEXT: bb.4: + ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.4: + ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -253,7 +261,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: - ; GFX908-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -269,16 +277,20 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: bb.3: + ; GFX908-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: successors: %bb.4(0x80000000) + ; GFX908-NEXT: bb.4: + ; GFX908-NEXT: successors: %bb.5(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: bb.4: + ; GFX908-NEXT: bb.5: ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset ; GFX90A: bb.1 (%ir-block.0): @@ -298,7 +310,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: - ; GFX90A-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -314,16 +326,20 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; GFX90A-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: bb.3: + ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.3: - ; GFX90A-NEXT: successors: %bb.4(0x80000000) + ; GFX90A-NEXT: bb.4: + ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.4: + ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll index ce73b0261fa93a..14c05ec94ce743 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll @@ -163,7 +163,7 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: - ; PACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -179,16 +179,20 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED-NEXT: {{ $}} + ; PACKED-NEXT: bb.3: + ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 4) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.3: - ; PACKED-NEXT: successors: %bb.4(0x80000000) + ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: bb.5: ; PACKED-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; UNPACKED-LABEL: name: raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset @@ -209,7 +213,7 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: - ; UNPACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -225,16 +229,20 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; UNPACKED-NEXT: {{ $}} + ; UNPACKED-NEXT: bb.3: + ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 4) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.3: - ; UNPACKED-NEXT: successors: %bb.4(0x80000000) + ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll index 8d1a5a9e4d303f..0441368d9905e9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll @@ -113,7 +113,7 @@ define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgp ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -129,16 +129,20 @@ define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgp ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll index edae186084dd20..93d807a8d269d3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll @@ -62,7 +62,7 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec @@ -75,16 +75,20 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -110,7 +114,7 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec @@ -126,16 +130,20 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -463,7 +471,7 @@ define amdgpu_ps half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec @@ -476,16 +484,20 @@ define amdgpu_ps half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -511,7 +523,7 @@ define amdgpu_ps float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec @@ -524,16 +536,20 @@ define amdgpu_ps float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -786,7 +802,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec @@ -799,16 +815,20 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_ADD_I32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 5000 @@ -838,7 +858,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -851,16 +871,20 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %14, [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 5000 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll index 5628d217e28a33..0e996101cbb7ea 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll @@ -172,7 +172,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: - ; UNPACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec @@ -185,16 +185,20 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; UNPACKED-NEXT: {{ $}} + ; UNPACKED-NEXT: bb.3: + ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 4) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.3: - ; UNPACKED-NEXT: successors: %bb.4(0x80000000) + ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16 ; PACKED: bb.1 (%ir-block.0): @@ -216,7 +220,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: - ; PACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec @@ -229,16 +233,20 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED-NEXT: {{ $}} + ; PACKED-NEXT: bb.3: + ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 4) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.3: - ; PACKED-NEXT: successors: %bb.4(0x80000000) + ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -469,7 +477,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: - ; UNPACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec @@ -482,16 +490,20 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; UNPACKED-NEXT: {{ $}} + ; UNPACKED-NEXT: bb.3: + ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 4) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.3: - ; UNPACKED-NEXT: successors: %bb.4(0x80000000) + ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16_add_4096 ; PACKED: bb.1 (%ir-block.0): @@ -516,7 +528,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: - ; PACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec @@ -529,16 +541,20 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED-NEXT: {{ $}} + ; PACKED-NEXT: bb.3: + ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 4) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.3: - ; PACKED-NEXT: successors: %bb.4(0x80000000) + ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll index 88279c4dec76c7..b496be3c576395 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll @@ -128,7 +128,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec @@ -141,16 +141,20 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -295,7 +299,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec @@ -308,16 +312,20 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %15, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll index fba88ca5f9e81c..5dfc64fa1b2306 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll @@ -64,7 +64,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -77,16 +77,20 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -110,21 +114,25 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -150,7 +158,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -166,16 +174,20 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -500,7 +512,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec @@ -513,16 +525,20 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -753,7 +769,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec @@ -766,16 +782,20 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE3]], [[COPY6]], 904, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 5000 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -803,7 +823,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -816,16 +836,20 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 5000, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll index 0c636e218316e8..75bdd845394bf2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll @@ -161,7 +161,7 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: - ; UNPACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -177,16 +177,20 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; UNPACKED-NEXT: {{ $}} + ; UNPACKED-NEXT: bb.3: + ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 4) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.3: - ; UNPACKED-NEXT: successors: %bb.4(0x80000000) + ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset @@ -207,7 +211,7 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: - ; PACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -223,16 +227,20 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED-NEXT: {{ $}} + ; PACKED-NEXT: bb.3: + ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 4) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.3: - ; PACKED-NEXT: successors: %bb.4(0x80000000) + ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: bb.5: ; PACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll index d5443eb65ef809..e5ee0d2d8a4a79 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll @@ -111,7 +111,7 @@ define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -127,16 +127,20 @@ define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U32_e64_]], [[S_AND_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll index afbda5f14e7107..36fa2bf0255d76 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll @@ -140,7 +140,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: - ; UNPACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -153,16 +153,20 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; UNPACKED-NEXT: {{ $}} + ; UNPACKED-NEXT: bb.3: + ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 4) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.3: - ; UNPACKED-NEXT: successors: %bb.4(0x80000000) + ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -182,7 +186,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: - ; PACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -195,16 +199,20 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED-NEXT: {{ $}} + ; PACKED-NEXT: bb.3: + ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 4) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.3: - ; PACKED-NEXT: successors: %bb.4(0x80000000) + ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void @@ -230,7 +238,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: - ; UNPACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -246,16 +254,20 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; UNPACKED-NEXT: {{ $}} + ; UNPACKED-NEXT: bb.3: + ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 4) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.3: - ; UNPACKED-NEXT: successors: %bb.4(0x80000000) + ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -275,7 +287,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: - ; PACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -291,16 +303,20 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED-NEXT: {{ $}} + ; PACKED-NEXT: bb.3: + ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 4) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.3: - ; PACKED-NEXT: successors: %bb.4(0x80000000) + ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -327,7 +343,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: - ; UNPACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec @@ -343,16 +359,20 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; UNPACKED-NEXT: {{ $}} + ; UNPACKED-NEXT: bb.3: + ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 4) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.3: - ; UNPACKED-NEXT: successors: %bb.4(0x80000000) + ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -373,7 +393,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: - ; PACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec @@ -389,16 +409,20 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED-NEXT: {{ $}} + ; PACKED-NEXT: bb.3: + ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 4) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.3: - ; PACKED-NEXT: successors: %bb.4(0x80000000) + ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll index 45755dbbe30be0..12a4396f62cfc9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll @@ -55,7 +55,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: - ; UNPACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -68,16 +68,20 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; UNPACKED-NEXT: {{ $}} + ; UNPACKED-NEXT: bb.3: + ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 4) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.3: - ; UNPACKED-NEXT: successors: %bb.4(0x80000000) + ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -97,7 +101,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: - ; PACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -110,16 +114,20 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED-NEXT: {{ $}} + ; PACKED-NEXT: bb.3: + ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 4) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.3: - ; PACKED-NEXT: successors: %bb.4(0x80000000) + ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void @@ -145,7 +153,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: - ; UNPACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -161,16 +169,20 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; UNPACKED-NEXT: {{ $}} + ; UNPACKED-NEXT: bb.3: + ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 4) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.3: - ; UNPACKED-NEXT: successors: %bb.4(0x80000000) + ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -190,7 +202,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: - ; PACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -206,16 +218,20 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED-NEXT: {{ $}} + ; PACKED-NEXT: bb.3: + ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 4) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.3: - ; PACKED-NEXT: successors: %bb.4(0x80000000) + ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void @@ -242,7 +258,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: - ; UNPACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec @@ -258,16 +274,20 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; UNPACKED-NEXT: {{ $}} + ; UNPACKED-NEXT: bb.3: + ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 4) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.3: - ; UNPACKED-NEXT: successors: %bb.4(0x80000000) + ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -288,7 +308,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: - ; PACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec @@ -304,16 +324,20 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED-NEXT: {{ $}} + ; PACKED-NEXT: bb.3: + ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 4) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.3: - ; PACKED-NEXT: successors: %bb.4(0x80000000) + ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll index d158509e5bfcbd..55f04b88801c1d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll @@ -131,7 +131,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soff ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -144,16 +144,20 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soff ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 1, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 1) ret void @@ -179,7 +183,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -195,16 +199,20 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U32_e64_]], [[S_AND_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void @@ -231,7 +239,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec @@ -247,16 +255,20 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U32_e64_]], [[S_AND_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -589,7 +601,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec @@ -602,16 +614,20 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %soffset = add i32 %soffset.base, 5000 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -641,7 +657,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec @@ -654,16 +670,20 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE3]], [[COPY6]], 904, 94, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %voffset = add i32 %voffset.base, 5000 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll index bc759193290036..fb4d8a8faf3b92 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll @@ -2701,8 +2701,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: - ; GFX6-NEXT: successors: %bb.3, %bb.2 - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -2714,14 +2712,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: successors: %bb.4, %bb.2 + ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: bb.4: ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.4: + ; GFX6-NEXT: bb.5: ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_rsrc @@ -2741,8 +2743,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: - ; GFX7-NEXT: successors: %bb.3, %bb.2 - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -2754,14 +2754,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: successors: %bb.4, %bb.2 + ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: bb.4: ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.4: + ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_vgpr_rsrc @@ -2781,8 +2785,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: - ; GFX8-NEXT: successors: %bb.3, %bb.2 - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -2794,14 +2796,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: successors: %bb.4, %bb.2 + ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: bb.4: ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.4: + ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) @@ -2825,8 +2831,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: - ; GFX6-NEXT: successors: %bb.3, %bb.2 - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub0, implicit $exec ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub1, implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -2838,14 +2842,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: successors: %bb.4, %bb.2 + ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4092, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: bb.4: ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.4: + ; GFX6-NEXT: bb.5: ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4092 @@ -2863,8 +2871,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: - ; GFX7-NEXT: successors: %bb.3, %bb.2 - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub0, implicit $exec ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub1, implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -2876,14 +2882,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: successors: %bb.4, %bb.2 + ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4092, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: bb.4: ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.4: + ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4092 @@ -2901,8 +2911,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: - ; GFX8-NEXT: successors: %bb.3, %bb.2 - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub0, implicit $exec ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub1, implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -2914,14 +2922,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: successors: %bb.4, %bb.2 + ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4092, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: bb.4: ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.4: + ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4092 @@ -2950,8 +2962,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: - ; GFX6-NEXT: successors: %bb.3, %bb.2 - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -2963,14 +2973,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: successors: %bb.4, %bb.2 + ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: bb.4: ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.4: + ; GFX6-NEXT: bb.5: ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4096 @@ -2992,8 +3006,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: - ; GFX7-NEXT: successors: %bb.3, %bb.2 - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -3005,14 +3017,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: successors: %bb.4, %bb.2 + ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: bb.4: ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.4: + ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4096 @@ -3034,8 +3050,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: - ; GFX8-NEXT: successors: %bb.3, %bb.2 - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -3047,14 +3061,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: successors: %bb.4, %bb.2 + ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: bb.4: ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.4: + ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4096 @@ -3079,8 +3097,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: - ; GFX6-NEXT: successors: %bb.3, %bb.2 - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]].sub0, implicit $exec ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]].sub1, implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -3092,14 +3108,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: successors: %bb.4, %bb.2 + ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4095, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: bb.4: ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.4: + ; GFX6-NEXT: bb.5: ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4095 @@ -3117,8 +3137,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: - ; GFX7-NEXT: successors: %bb.3, %bb.2 - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]].sub0, implicit $exec ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]].sub1, implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -3130,14 +3148,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: successors: %bb.4, %bb.2 + ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4095, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: bb.4: ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.4: + ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4095 @@ -3155,8 +3177,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: - ; GFX8-NEXT: successors: %bb.3, %bb.2 - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]].sub0, implicit $exec ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]].sub1, implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -3168,14 +3188,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: successors: %bb.4, %bb.2 + ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4095, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: bb.4: ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.4: + ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4095, i32 0) @@ -3201,8 +3225,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: - ; GFX6-NEXT: successors: %bb.3, %bb.2 - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub0, implicit $exec ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub1, implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -3214,14 +3236,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: successors: %bb.4, %bb.2 + ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: bb.4: ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.4: + ; GFX6-NEXT: bb.5: ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4096 @@ -3241,8 +3267,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: - ; GFX7-NEXT: successors: %bb.3, %bb.2 - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub0, implicit $exec ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub1, implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -3254,14 +3278,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: successors: %bb.4, %bb.2 + ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: bb.4: ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.4: + ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4096 @@ -3279,8 +3307,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: - ; GFX8-NEXT: successors: %bb.3, %bb.2 - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]].sub0, implicit $exec ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]].sub1, implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -3292,14 +3318,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: successors: %bb.4, %bb.2 + ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4095, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4096) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: bb.4: ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.4: + ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 0) @@ -3324,8 +3354,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: - ; GFX6-NEXT: successors: %bb.3, %bb.2 - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub0, implicit $exec ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub1, implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -3337,15 +3365,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: successors: %bb.4, %bb.2 + ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4064, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4080, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: bb.4: ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.4: + ; GFX6-NEXT: bb.5: ; GFX6-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; GFX6-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 @@ -3379,8 +3411,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: - ; GFX7-NEXT: successors: %bb.3, %bb.2 - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub0, implicit $exec ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub1, implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -3392,15 +3422,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: successors: %bb.4, %bb.2 + ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4064, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4080, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: bb.4: ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.4: + ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; GFX7-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 @@ -3434,8 +3468,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: - ; GFX8-NEXT: successors: %bb.3, %bb.2 - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub0, implicit $exec ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub1, implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -3447,15 +3479,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: successors: %bb.4, %bb.2 + ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4064, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4080, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: bb.4: ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.4: + ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 @@ -3501,8 +3537,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: - ; GFX6-NEXT: successors: %bb.3, %bb.2 - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -3514,15 +3548,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: successors: %bb.4, %bb.2 + ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: bb.4: ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.4: + ; GFX6-NEXT: bb.5: ; GFX6-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; GFX6-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 @@ -3560,8 +3598,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: - ; GFX7-NEXT: successors: %bb.3, %bb.2 - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -3573,15 +3609,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: successors: %bb.4, %bb.2 + ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: bb.4: ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.4: + ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; GFX7-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 @@ -3619,8 +3659,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: - ; GFX8-NEXT: successors: %bb.3, %bb.2 - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -3632,15 +3670,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: successors: %bb.4, %bb.2 + ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: bb.4: ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.4: + ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 @@ -3684,8 +3726,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: - ; GFX6-NEXT: successors: %bb.3, %bb.2 - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -3697,15 +3737,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: successors: %bb.4, %bb.2 + ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: bb.4: ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.4: + ; GFX6-NEXT: bb.5: ; GFX6-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; GFX6-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 @@ -3743,8 +3787,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: - ; GFX7-NEXT: successors: %bb.3, %bb.2 - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -3756,15 +3798,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: successors: %bb.4, %bb.2 + ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: bb.4: ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.4: + ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; GFX7-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 @@ -3802,8 +3848,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: - ; GFX8-NEXT: successors: %bb.3, %bb.2 - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -3815,15 +3859,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: successors: %bb.4, %bb.2 + ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: bb.4: ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.4: + ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 @@ -3864,8 +3912,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: - ; GFX6-NEXT: successors: %bb.3, %bb.2 - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub0, implicit $exec ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub1, implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -3877,15 +3923,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: successors: %bb.4, %bb.2 + ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: bb.4: ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.4: + ; GFX6-NEXT: bb.5: ; GFX6-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; GFX6-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 @@ -3920,8 +3970,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: - ; GFX7-NEXT: successors: %bb.3, %bb.2 - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub0, implicit $exec ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub1, implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -3933,15 +3981,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: successors: %bb.4, %bb.2 + ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: bb.4: ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.4: + ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; GFX7-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 @@ -3976,8 +4028,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: - ; GFX8-NEXT: successors: %bb.3, %bb.2 - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub0, implicit $exec ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub1, implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -3989,15 +4039,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: successors: %bb.4, %bb.2 + ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 936, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 952, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: bb.4: ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.4: + ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 @@ -4038,8 +4092,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: - ; GFX6-NEXT: successors: %bb.3, %bb.2 - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub0, implicit $exec ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub1, implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -4051,15 +4103,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: successors: %bb.4, %bb.2 + ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: bb.4: ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.4: + ; GFX6-NEXT: bb.5: ; GFX6-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; GFX6-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 @@ -4094,8 +4150,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: - ; GFX7-NEXT: successors: %bb.3, %bb.2 - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub0, implicit $exec ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub1, implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -4107,15 +4161,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: successors: %bb.4, %bb.2 + ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: bb.4: ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.4: + ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; GFX7-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 @@ -4150,8 +4208,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: - ; GFX8-NEXT: successors: %bb.3, %bb.2 - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub0, implicit $exec ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub1, implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -4163,15 +4219,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: successors: %bb.4, %bb.2 + ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: bb.4: ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.4: + ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 @@ -4212,8 +4272,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: - ; GFX6-NEXT: successors: %bb.3, %bb.2 - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub0, implicit $exec ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub1, implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -4225,15 +4283,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: successors: %bb.4, %bb.2 + ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: bb.4: ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.4: + ; GFX6-NEXT: bb.5: ; GFX6-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; GFX6-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 @@ -4268,8 +4330,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: - ; GFX7-NEXT: successors: %bb.3, %bb.2 - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub0, implicit $exec ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub1, implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -4281,15 +4341,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: successors: %bb.4, %bb.2 + ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: bb.4: ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.4: + ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; GFX7-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 @@ -4324,8 +4388,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: - ; GFX8-NEXT: successors: %bb.3, %bb.2 - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub0, implicit $exec ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]].sub1, implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -4337,15 +4399,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: successors: %bb.4, %bb.2 + ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: bb.4: ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.4: + ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 @@ -4385,8 +4451,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: - ; GFX6-NEXT: successors: %bb.3, %bb.2 - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]].sub0, implicit $exec ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]].sub1, implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -4398,15 +4462,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: successors: %bb.4, %bb.2 + ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.3: + ; GFX6-NEXT: bb.4: ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: bb.4: + ; GFX6-NEXT: bb.5: ; GFX6-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; GFX6-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 @@ -4440,8 +4508,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: - ; GFX7-NEXT: successors: %bb.3, %bb.2 - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]].sub0, implicit $exec ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]].sub1, implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -4453,15 +4519,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: successors: %bb.4, %bb.2 + ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.3: + ; GFX7-NEXT: bb.4: ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: bb.4: + ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 @@ -4495,8 +4565,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: - ; GFX8-NEXT: successors: %bb.3, %bb.2 - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]].sub0, implicit $exec ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]].sub1, implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -4508,15 +4576,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: successors: %bb.4, %bb.2 + ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: bb.4: ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: bb.4: + ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll index 55b8ab2c818e0d..c9884689e30c0d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll @@ -123,7 +123,7 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_ ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec @@ -139,17 +139,21 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_ ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -181,7 +185,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__ ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec @@ -197,17 +201,21 @@ define amdgpu_ps void @struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__ ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll index bbf5ae80e015f7..65ca08b45d6784 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll @@ -80,7 +80,7 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vg ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]].sub1, implicit $exec @@ -96,6 +96,10 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vg ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1 ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) @@ -103,12 +107,12 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vg ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[COPY15]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -142,7 +146,7 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cm ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]].sub1, implicit $exec @@ -158,6 +162,10 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cm ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1 ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) @@ -165,12 +173,12 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cm ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll index 186065d807416f..8ee34c3da49459 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll @@ -169,7 +169,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: - ; GFX908-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec @@ -185,17 +185,21 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: bb.3: + ; GFX908-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: successors: %bb.4(0x80000000) + ; GFX908-NEXT: bb.4: + ; GFX908-NEXT: successors: %bb.5(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: bb.4: + ; GFX908-NEXT: bb.5: ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset ; GFX90A: bb.1 (%ir-block.0): @@ -219,7 +223,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: - ; GFX90A-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec @@ -235,17 +239,21 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; GFX90A-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: bb.3: + ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX90A-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.3: - ; GFX90A-NEXT: successors: %bb.4(0x80000000) + ; GFX90A-NEXT: bb.4: + ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.4: + ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -273,7 +281,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: - ; GFX908-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec @@ -289,16 +297,20 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: bb.3: + ; GFX908-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: successors: %bb.4(0x80000000) + ; GFX908-NEXT: bb.4: + ; GFX908-NEXT: successors: %bb.5(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: bb.4: + ; GFX908-NEXT: bb.5: ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset ; GFX90A: bb.1 (%ir-block.0): @@ -320,7 +332,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: - ; GFX90A-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec @@ -336,16 +348,20 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; GFX90A-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: bb.3: + ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 4) ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.3: - ; GFX90A-NEXT: successors: %bb.4(0x80000000) + ; GFX90A-NEXT: bb.4: + ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.4: + ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll index a365aa15573b57..849523e11c50fc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll @@ -176,7 +176,7 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: - ; UNPACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec @@ -192,17 +192,21 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; UNPACKED-NEXT: {{ $}} + ; UNPACKED-NEXT: bb.3: + ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 4) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.3: - ; UNPACKED-NEXT: successors: %bb.4(0x80000000) + ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub0 ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1 ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub2 @@ -246,7 +250,7 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: - ; PACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec @@ -262,17 +266,21 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED-NEXT: {{ $}} + ; PACKED-NEXT: bb.3: + ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 4) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.3: - ; PACKED-NEXT: successors: %bb.4(0x80000000) + ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: bb.5: ; PACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 ; PACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 ; PACKED-NEXT: $vgpr0 = COPY [[COPY11]] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll index 98426e2d328bc6..a7fab9821eebf0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll @@ -123,7 +123,7 @@ define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__vpr_rsrc__sgpr_vi ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec @@ -139,17 +139,21 @@ define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__vpr_rsrc__sgpr_vi ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1 ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll index 36290d7cd72460..93927318961f2b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll @@ -194,7 +194,7 @@ define amdgpu_ps float @struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_vof ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec @@ -210,17 +210,21 @@ define amdgpu_ps float @struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_vof ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll index 0a9e46d1ecc725..54fc5dcd476d89 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll @@ -155,7 +155,7 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: - ; UNPACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec @@ -171,17 +171,21 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; UNPACKED-NEXT: {{ $}} + ; UNPACKED-NEXT: bb.3: + ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 4) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.3: - ; UNPACKED-NEXT: successors: %bb.4(0x80000000) + ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; PACKED-LABEL: name: struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -205,7 +209,7 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: - ; PACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec @@ -221,17 +225,21 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED-NEXT: {{ $}} + ; PACKED-NEXT: bb.3: + ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 4) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.3: - ; PACKED-NEXT: successors: %bb.4(0x80000000) + ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll index 561ae60863ccfe..72ebb811fc8501 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll @@ -117,7 +117,7 @@ define amdgpu_ps void @struct_buffer_store_format_f32__sgpr_val__vgpr_rsrc__sgpr ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec @@ -133,17 +133,21 @@ define amdgpu_ps void @struct_buffer_store_format_f32__sgpr_val__vgpr_rsrc__sgpr ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; CHECK-NEXT: BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll index bd1739d9d92857..bd6fad5e071de1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll @@ -123,7 +123,7 @@ define amdgpu_ps void @struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vinde ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY14]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY14]].sub1, implicit $exec @@ -139,17 +139,21 @@ define amdgpu_ps void @struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vinde ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1 ; CHECK-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact [[COPY11]], [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll index fd3c1fa6c9c09b..78a636fd22b2f5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll @@ -215,7 +215,7 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: - ; PACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec @@ -231,17 +231,21 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED-NEXT: {{ $}} + ; PACKED-NEXT: bb.3: + ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 4) ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.3: - ; PACKED-NEXT: successors: %bb.4(0x80000000) + ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: bb.4: + ; PACKED-NEXT: bb.5: ; PACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 ; PACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 ; PACKED-NEXT: $vgpr0 = COPY [[COPY11]] @@ -267,7 +271,7 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: - ; UNPACKED-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec @@ -283,17 +287,21 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; UNPACKED-NEXT: {{ $}} + ; UNPACKED-NEXT: bb.3: + ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 4) ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.3: - ; UNPACKED-NEXT: successors: %bb.4(0x80000000) + ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: bb.4: + ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub0 ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1 ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll index 7acda239589991..3ad8ae87cf8489 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll @@ -144,7 +144,7 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_ ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec @@ -160,17 +160,21 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_ ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1 ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir index a5508802d1762d..80383c7b2280b4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir @@ -66,9 +66,9 @@ body: | ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .1: - ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %9, %bb.1 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %9, %bb.2 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -80,16 +80,20 @@ body: | ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .2: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY2]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: .2: - ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: .3: + ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: .3: + ; CHECK-NEXT: .4: %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $sgpr0 %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0 @@ -117,9 +121,9 @@ body: | ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .1: - ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %8, %bb.1 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %8, %bb.2 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -131,16 +135,20 @@ body: | ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .2: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY1]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: .2: - ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: .3: + ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: .3: + ; CHECK-NEXT: .4: %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $vgpr4 %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll index 7ea21033312a92..0a1a0438e6770a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll @@ -117,9 +117,9 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) { ; FAST-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.2: - ; FAST-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; FAST-NEXT: successors: %bb.3(0x80000000) ; FAST-NEXT: {{ $}} - ; FAST-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %17, %bb.2 + ; FAST-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %17, %bb.3 ; FAST-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; FAST-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; FAST-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -141,16 +141,20 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) { ; FAST-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; FAST-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) ; FAST-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec + ; FAST-NEXT: {{ $}} + ; FAST-NEXT: bb.3: + ; FAST-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; FAST-NEXT: {{ $}} ; FAST-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; FAST-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; FAST-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; FAST-NEXT: {{ $}} - ; FAST-NEXT: bb.3: - ; FAST-NEXT: successors: %bb.4(0x80000000) + ; FAST-NEXT: bb.4: + ; FAST-NEXT: successors: %bb.5(0x80000000) ; FAST-NEXT: {{ $}} ; FAST-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; FAST-NEXT: {{ $}} - ; FAST-NEXT: bb.4: + ; FAST-NEXT: bb.5: ; FAST-NEXT: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY9]](p1) :: (store (<4 x s32>) into `<4 x float> addrspace(1)* undef`, addrspace 1) ; FAST-NEXT: S_ENDPGM 0 @@ -175,9 +179,9 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) { ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GREEDY-NEXT: successors: %bb.3(0x80000000) ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %17, %bb.2 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %17, %bb.3 ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -199,16 +203,20 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) { ; GREEDY-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; GREEDY-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) ; GREEDY-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY-NEXT: {{ $}} + ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.3: - ; GREEDY-NEXT: successors: %bb.4(0x80000000) + ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: successors: %bb.5(0x80000000) ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; GREEDY-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY9]](p1) :: (store (<4 x s32>) into `<4 x float> addrspace(1)* undef`, addrspace 1) ; GREEDY-NEXT: S_ENDPGM 0 @@ -241,9 +249,9 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg ; FAST-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.2: - ; FAST-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; FAST-NEXT: successors: %bb.3(0x80000000) ; FAST-NEXT: {{ $}} - ; FAST-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %18, %bb.2 + ; FAST-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %18, %bb.3 ; FAST-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; FAST-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; FAST-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -265,16 +273,20 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg ; FAST-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; FAST-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) ; FAST-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec + ; FAST-NEXT: {{ $}} + ; FAST-NEXT: bb.3: + ; FAST-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; FAST-NEXT: {{ $}} ; FAST-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; FAST-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; FAST-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; FAST-NEXT: {{ $}} - ; FAST-NEXT: bb.3: - ; FAST-NEXT: successors: %bb.4(0x80000000) + ; FAST-NEXT: bb.4: + ; FAST-NEXT: successors: %bb.5(0x80000000) ; FAST-NEXT: {{ $}} ; FAST-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; FAST-NEXT: {{ $}} - ; FAST-NEXT: bb.4: + ; FAST-NEXT: bb.5: ; FAST-NEXT: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY10]](p1) :: (store (<4 x s32>) into `<4 x float> addrspace(1)* undef`, addrspace 1) ; FAST-NEXT: S_ENDPGM 0 @@ -300,9 +312,9 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GREEDY-NEXT: successors: %bb.3(0x80000000) ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %18, %bb.2 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %18, %bb.3 ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -324,16 +336,20 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg ; GREEDY-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; GREEDY-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) ; GREEDY-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY-NEXT: {{ $}} + ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.3: - ; GREEDY-NEXT: successors: %bb.4(0x80000000) + ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: successors: %bb.5(0x80000000) ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; GREEDY-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY10]](p1) :: (store (<4 x s32>) into `<4 x float> addrspace(1)* undef`, addrspace 1) ; GREEDY-NEXT: S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll index 263b0c6223961e..a0ed8d7ddecb31 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll @@ -138,9 +138,9 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr ; FAST-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.2: - ; FAST-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; FAST-NEXT: successors: %bb.3(0x80000000) ; FAST-NEXT: {{ $}} - ; FAST-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.2 + ; FAST-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.3 ; FAST-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; FAST-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; FAST-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -162,16 +162,20 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr ; FAST-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; FAST-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) ; FAST-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec + ; FAST-NEXT: {{ $}} + ; FAST-NEXT: bb.3: + ; FAST-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; FAST-NEXT: {{ $}} ; FAST-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; FAST-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; FAST-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; FAST-NEXT: {{ $}} - ; FAST-NEXT: bb.3: - ; FAST-NEXT: successors: %bb.4(0x80000000) + ; FAST-NEXT: bb.4: + ; FAST-NEXT: successors: %bb.5(0x80000000) ; FAST-NEXT: {{ $}} ; FAST-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; FAST-NEXT: {{ $}} - ; FAST-NEXT: bb.4: + ; FAST-NEXT: bb.5: ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `<4 x float> addrspace(1)* undef`, addrspace 1) ; FAST-NEXT: S_ENDPGM 0 ; GREEDY-LABEL: name: sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp @@ -200,9 +204,9 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GREEDY-NEXT: successors: %bb.3(0x80000000) ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.2 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.3 ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -224,16 +228,20 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr ; GREEDY-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; GREEDY-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) ; GREEDY-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY-NEXT: {{ $}} + ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.3: - ; GREEDY-NEXT: successors: %bb.4(0x80000000) + ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: successors: %bb.5(0x80000000) ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `<4 x float> addrspace(1)* undef`, addrspace 1) ; GREEDY-NEXT: S_ENDPGM 0 %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -269,9 +277,9 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre ; FAST-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.2: - ; FAST-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; FAST-NEXT: successors: %bb.3(0x80000000) ; FAST-NEXT: {{ $}} - ; FAST-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.2 + ; FAST-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.3 ; FAST-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; FAST-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; FAST-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -283,16 +291,20 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre ; FAST-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; FAST-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; FAST-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; FAST-NEXT: {{ $}} + ; FAST-NEXT: bb.3: + ; FAST-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; FAST-NEXT: {{ $}} ; FAST-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; FAST-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; FAST-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; FAST-NEXT: {{ $}} - ; FAST-NEXT: bb.3: - ; FAST-NEXT: successors: %bb.4(0x80000000) + ; FAST-NEXT: bb.4: + ; FAST-NEXT: successors: %bb.5(0x80000000) ; FAST-NEXT: {{ $}} ; FAST-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; FAST-NEXT: {{ $}} - ; FAST-NEXT: bb.4: + ; FAST-NEXT: bb.5: ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `<4 x float> addrspace(1)* undef`, addrspace 1) ; FAST-NEXT: S_ENDPGM 0 ; GREEDY-LABEL: name: sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp @@ -321,9 +333,9 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GREEDY-NEXT: successors: %bb.3(0x80000000) ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.2 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.3 ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -335,16 +347,20 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre ; GREEDY-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GREEDY-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GREEDY-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY-NEXT: {{ $}} + ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.3: - ; GREEDY-NEXT: successors: %bb.4(0x80000000) + ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: successors: %bb.5(0x80000000) ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `<4 x float> addrspace(1)* undef`, addrspace 1) ; GREEDY-NEXT: S_ENDPGM 0 %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -381,9 +397,9 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr ; FAST-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.2: - ; FAST-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; FAST-NEXT: successors: %bb.3(0x80000000) ; FAST-NEXT: {{ $}} - ; FAST-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.2 + ; FAST-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.3 ; FAST-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; FAST-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; FAST-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -416,16 +432,20 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr ; FAST-NEXT: [[S_AND_B64_4:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_5]], [[S_AND_B64_3]], implicit-def $scc ; FAST-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_8]](s32), [[V_READFIRSTLANE_B32_9]](s32), [[V_READFIRSTLANE_B32_10]](s32), [[V_READFIRSTLANE_B32_11]](s32) ; FAST-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_4]], implicit-def $exec, implicit-def $scc, implicit $exec + ; FAST-NEXT: {{ $}} + ; FAST-NEXT: bb.3: + ; FAST-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; FAST-NEXT: {{ $}} ; FAST-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; FAST-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; FAST-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; FAST-NEXT: {{ $}} - ; FAST-NEXT: bb.3: - ; FAST-NEXT: successors: %bb.4(0x80000000) + ; FAST-NEXT: bb.4: + ; FAST-NEXT: successors: %bb.5(0x80000000) ; FAST-NEXT: {{ $}} ; FAST-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; FAST-NEXT: {{ $}} - ; FAST-NEXT: bb.4: + ; FAST-NEXT: bb.5: ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `<4 x float> addrspace(1)* undef`, addrspace 1) ; FAST-NEXT: S_ENDPGM 0 ; GREEDY-LABEL: name: sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp @@ -455,9 +475,9 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GREEDY-NEXT: successors: %bb.3(0x80000000) ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.2 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.3 ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -490,16 +510,20 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr ; GREEDY-NEXT: [[S_AND_B64_4:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_5]], [[S_AND_B64_3]], implicit-def $scc ; GREEDY-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_8]](s32), [[V_READFIRSTLANE_B32_9]](s32), [[V_READFIRSTLANE_B32_10]](s32), [[V_READFIRSTLANE_B32_11]](s32) ; GREEDY-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_4]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY-NEXT: {{ $}} + ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.3: - ; GREEDY-NEXT: successors: %bb.4(0x80000000) + ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: successors: %bb.5(0x80000000) ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `<4 x float> addrspace(1)* undef`, addrspace 1) ; GREEDY-NEXT: S_ENDPGM 0 %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll index f0b403920de202..e14262c416406f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll @@ -68,9 +68,9 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -82,16 +82,20 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -118,22 +122,26 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY5]](s32), implicit $exec ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -161,9 +169,9 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -178,16 +186,20 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY5]](s32), implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll index 8aab7e014e3a35..57e4d40bb9c814 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll @@ -1567,9 +1567,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3, %bb.2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -1581,14 +1579,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4, %bb.2 + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY5]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: bb.4: ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_rsrc @@ -1609,9 +1611,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: successors: %bb.3, %bb.2 - ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.2 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -1623,14 +1623,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GREEDY-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GREEDY-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GREEDY-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY-NEXT: {{ $}} + ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: successors: %bb.4, %bb.2 + ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY5]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32)) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) @@ -1658,9 +1662,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3, %bb.2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %16, %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %16, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -1672,14 +1674,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4, %bb.2 + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4092, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: bb.4: ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4092 @@ -1701,9 +1707,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: successors: %bb.3, %bb.2 - ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %16, %bb.2 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %16, %bb.3 ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -1715,14 +1719,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GREEDY-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GREEDY-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GREEDY-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY-NEXT: {{ $}} + ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: successors: %bb.4, %bb.2 + ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4092, 0, 0 :: (dereferenceable invariant load (s32)) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4092 @@ -1752,9 +1760,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3, %bb.2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %17, %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %17, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -1766,14 +1772,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4, %bb.2 + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: bb.4: ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4096 @@ -1796,9 +1806,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: successors: %bb.3, %bb.2 - ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %17, %bb.2 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %17, %bb.3 ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -1810,14 +1818,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GREEDY-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GREEDY-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GREEDY-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY-NEXT: {{ $}} + ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: successors: %bb.4, %bb.2 + ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32)) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4096 @@ -1845,9 +1857,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3, %bb.2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -1859,14 +1869,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4, %bb.2 + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: bb.4: ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4095 @@ -1887,9 +1901,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: successors: %bb.3, %bb.2 - ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.2 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -1901,14 +1913,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GREEDY-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GREEDY-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GREEDY-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY-NEXT: {{ $}} + ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: successors: %bb.4, %bb.2 + ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4095, i32 0) @@ -1935,9 +1951,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3, %bb.2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -1949,14 +1963,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4, %bb.2 + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: bb.4: ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4096 @@ -1977,9 +1995,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: successors: %bb.3, %bb.2 - ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.2 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -1991,14 +2007,18 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GREEDY-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GREEDY-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GREEDY-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY-NEXT: {{ $}} + ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: successors: %bb.4, %bb.2 + ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32)) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 0) @@ -2027,9 +2047,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3, %bb.2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2041,15 +2059,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4, %bb.2 + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: bb.4: ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV2]](s32) @@ -2080,9 +2102,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: successors: %bb.3, %bb.2 - ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2094,15 +2114,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GREEDY-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GREEDY-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GREEDY-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY-NEXT: {{ $}} + ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: successors: %bb.4, %bb.2 + ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; GREEDY-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; GREEDY-NEXT: $vgpr0 = COPY [[UV2]](s32) @@ -2142,9 +2166,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3, %bb.2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2156,15 +2178,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4, %bb.2 + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: bb.4: ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV2]](s32) @@ -2196,9 +2222,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: successors: %bb.3, %bb.2 - ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3 ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2210,15 +2234,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GREEDY-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GREEDY-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GREEDY-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY-NEXT: {{ $}} + ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: successors: %bb.4, %bb.2 + ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; GREEDY-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; GREEDY-NEXT: $vgpr0 = COPY [[UV2]](s32) @@ -2256,9 +2284,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3, %bb.2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2270,15 +2296,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4, %bb.2 + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: bb.4: ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV2]](s32) @@ -2310,9 +2340,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: successors: %bb.3, %bb.2 - ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3 ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2324,15 +2352,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GREEDY-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GREEDY-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GREEDY-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY-NEXT: {{ $}} + ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: successors: %bb.4, %bb.2 + ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; GREEDY-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; GREEDY-NEXT: $vgpr0 = COPY [[UV2]](s32) @@ -2369,9 +2401,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3, %bb.2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2383,15 +2413,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4, %bb.2 + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: bb.4: ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV2]](s32) @@ -2422,9 +2456,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: successors: %bb.3, %bb.2 - ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2436,15 +2468,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GREEDY-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GREEDY-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GREEDY-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY-NEXT: {{ $}} + ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: successors: %bb.4, %bb.2 + ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; GREEDY-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; GREEDY-NEXT: $vgpr0 = COPY [[UV2]](s32) @@ -2481,9 +2517,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3, %bb.2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2495,15 +2529,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4, %bb.2 + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: bb.4: ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV2]](s32) @@ -2534,9 +2572,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: successors: %bb.3, %bb.2 - ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2548,15 +2584,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GREEDY-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GREEDY-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GREEDY-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY-NEXT: {{ $}} + ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: successors: %bb.4, %bb.2 + ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; GREEDY-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; GREEDY-NEXT: $vgpr0 = COPY [[UV2]](s32) @@ -2593,9 +2633,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3, %bb.2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2607,15 +2645,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4, %bb.2 + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: bb.4: ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV2]](s32) @@ -2646,9 +2688,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: successors: %bb.3, %bb.2 - ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2660,15 +2700,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GREEDY-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GREEDY-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GREEDY-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY-NEXT: {{ $}} + ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: successors: %bb.4, %bb.2 + ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; GREEDY-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; GREEDY-NEXT: $vgpr0 = COPY [[UV2]](s32) @@ -2704,9 +2748,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3, %bb.2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2718,15 +2760,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4, %bb.2 + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: bb.4: ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV2]](s32) @@ -2756,9 +2802,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GREEDY-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.2: - ; GREEDY-NEXT: successors: %bb.3, %bb.2 - ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 + ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2770,15 +2814,19 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GREEDY-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GREEDY-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GREEDY-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY-NEXT: {{ $}} + ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: successors: %bb.4, %bb.2 + ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.3: + ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: bb.4: + ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; GREEDY-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; GREEDY-NEXT: $vgpr0 = COPY [[UV2]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll index 0578d30c0d0db6..56cff7cde87868 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll @@ -66,9 +66,9 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -80,16 +80,20 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[COPY6]], 0, 0, -1 :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -115,22 +119,26 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex_vgp ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY6]](s32), implicit $exec ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[V_READFIRSTLANE_B32_]], 0, 0, -1 :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -157,9 +165,9 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -174,16 +182,20 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY6]](s32), implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[V_READFIRSTLANE_B32_4]], 0, 0, -1 :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll index 30724fd1f889e0..42ee4da9fdaf31 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll @@ -68,9 +68,9 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -82,16 +82,20 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[COPY7]], 0, 0, -1 :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -117,22 +121,26 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY7]](s32), implicit $exec ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[V_READFIRSTLANE_B32_]], 0, 0, -1 :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -159,9 +167,9 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -176,16 +184,20 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY7]](s32), implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[V_READFIRSTLANE_B32_4]], 0, 0, -1 :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir index 0919c3edd6208e..07c2d682fcacd4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir @@ -28,22 +28,26 @@ body: | ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .1: - ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %9, %bb.1 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %9, %bb.2 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .2: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE %val(s32), %rsrc(<4 x s32>), [[COPY]](s32), %voffset, [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable store (s32), addrspace 4) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: .2: - ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: .3: + ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: .3: + ; CHECK-NEXT: .4: ; CHECK-NEXT: S_ENDPGM 0 %val:_(s32) = COPY $vgpr0 %rsrc:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 @@ -75,9 +79,9 @@ body: | ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .1: - ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %6, %bb.1 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %6, %bb.2 ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -99,16 +103,20 @@ body: | ; CHECK-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .2: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY1]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>)) ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: .2: - ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: .3: + ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: .3: + ; CHECK-NEXT: .4: ; CHECK-NEXT: S_ENDPGM 0, implicit [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) %0:_(<8 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 %1:_(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll index 3af1aa69abc3e4..d2cc65f922e0a2 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -229,7 +229,6 @@ entry: ; W64-O0-DAG: buffer_store_dword [[IDX_V]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Spill ; W64-O0: [[LOOPBB0:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1 -; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload ; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:36 ; 4-byte Folded Reload @@ -250,6 +249,7 @@ entry: ; W64-O0-DAG: s_mov_b32 s[[S2:[0-9]+]], s[[SRSRCTMP2]] ; W64-O0-DAG: s_mov_b32 s[[S3:[0-9]+]], s[[SRSRCTMP3]] ; W64-O0: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] +; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload ; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[S0]]:[[S3]]], {{.*}} idxen ; W64-O0: s_waitcnt vmcnt(0) ; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill @@ -268,7 +268,6 @@ entry: ; W64-O0: v_writelane_b32 [[VSAVEEXEC]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]] ; W64-O0: [[LOOPBB1:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1 -; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:[[IDX_OFF]] ; 4-byte Folded Reload ; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -289,6 +288,7 @@ entry: ; W64-O0-DAG: s_mov_b32 s[[S2:[0-9]+]], s[[SRSRCTMP2]] ; W64-O0-DAG: s_mov_b32 s[[S3:[0-9]+]], s[[SRSRCTMP3]] ; W64-O0: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] +; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:[[IDX_OFF]] ; 4-byte Folded Reload ; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[S0]]:[[S3]]], {{.*}} idxen ; W64-O0: s_waitcnt vmcnt(0) ; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir index 8e339008047648..0b7b4ad896c274 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir @@ -16,7 +16,7 @@ # W64: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 # W64: [[SAVEEXEC:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec # W64-LABEL: bb.1: -# W64-NEXT: successors: %bb.1({{.*}}), %bb.2({{.*}}) +# W64-NEXT: successors: %bb.2({{.*}}) # W64: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec # W64: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec # W64: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1 @@ -28,10 +28,12 @@ # W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc # W64: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec +# W64-LABEL: bb.2: +# W64-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}}) # W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc # W64: SI_WATERFALL_LOOP %bb.1, implicit $exec -# W64-LABEL: bb.2: +# W64-LABEL: bb.3: # W64: $exec = S_MOV_B64 [[SAVEEXEC]] # W32-LABEL: name: idxen @@ -40,7 +42,7 @@ # W32: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 # W32: [[SAVEEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo # W32-LABEL: bb.1: -# W32-NEXT: successors: %bb.1({{.*}}), %bb.2({{.*}}) +# W32-NEXT: successors: %bb.2({{.*}}) # W32: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec # W32: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec # W32: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1 @@ -52,11 +54,13 @@ # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc # W32: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec +# W32-LABEL: bb.2: +# W32-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}}) # W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec # TODO: S_XOR_B32_term should be `implicit-def $scc` # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] # W32: SI_WATERFALL_LOOP %bb.1, implicit $exec -# W32-LABEL: bb.2: +# W32-LABEL: bb.3: # W32: $exec_lo = S_MOV_B32 [[SAVEEXEC]] --- name: idxen @@ -89,7 +93,7 @@ body: | # W64: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 # W64: [[SAVEEXEC:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec # W64-LABEL: bb.1: -# W64-NEXT: successors: %bb.1({{.*}}), %bb.2({{.*}}) +# W64-NEXT: successors: %bb.2({{.*}}) # W64: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec # W64: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec # W64: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1 @@ -101,10 +105,12 @@ body: | # W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc # W64: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec +# W64-LABEL: bb.2: +# W64-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}}) # W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc # W64: SI_WATERFALL_LOOP %bb.1, implicit $exec -# W64-LABEL: bb.2: +# W64-LABEL: bb.3: # W64: $exec = S_MOV_B64 [[SAVEEXEC]] # W32-LABEL: name: offen @@ -113,7 +119,7 @@ body: | # W32: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 # W32: [[SAVEEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo # W32-LABEL: bb.1: -# W32-NEXT: successors: %bb.1({{.*}}), %bb.2({{.*}}) +# W32-NEXT: successors: %bb.2({{.*}}) # W32: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec # W32: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec # W32: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1 @@ -125,11 +131,13 @@ body: | # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc # W32: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec +# W32-LABEL: bb.2: +# W32-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}}) # W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec # TODO: S_XOR_B32_term should be `implicit-def $scc` # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] # W32: SI_WATERFALL_LOOP %bb.1, implicit $exec -# W32-LABEL: bb.2: +# W32-LABEL: bb.3: # W32: $exec_lo = S_MOV_B32 [[SAVEEXEC]] --- name: offen @@ -162,7 +170,7 @@ body: | # W64: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 # W64: [[SAVEEXEC:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec # W64-LABEL: bb.1: -# W64-NEXT: successors: %bb.1({{.*}}), %bb.2({{.*}}) +# W64-NEXT: successors: %bb.2({{.*}}) # W64: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec # W64: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec # W64: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1 @@ -174,10 +182,12 @@ body: | # W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc # W64: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec +# W64-LABEL: bb.2: +# W64-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}}) # W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc # W64: SI_WATERFALL_LOOP %bb.1, implicit $exec -# W64-LABEL: bb.2: +# W64-LABEL: bb.3: # W64: $exec = S_MOV_B64 [[SAVEEXEC]] # W32-LABEL: name: bothen @@ -186,7 +196,7 @@ body: | # W32: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 # W32: [[SAVEEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo # W32-LABEL: bb.1: -# W32-NEXT: successors: %bb.1({{.*}}), %bb.2({{.*}}) +# W32-NEXT: successors: %bb.2({{.*}}) # W32: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec # W32: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec # W32: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1 @@ -198,11 +208,13 @@ body: | # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc # W32: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec +# W32-LABEL: bb.2: +# W32-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}}) # W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec # TODO: S_XOR_B32_term should be `implicit-def $scc` # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] # W32: SI_WATERFALL_LOOP %bb.1, implicit $exec -# W32-LABEL: bb.2: +# W32-LABEL: bb.3: # W32: $exec_lo = S_MOV_B32 [[SAVEEXEC]] --- name: bothen @@ -272,7 +284,7 @@ body: | # W64-NO-ADDR64: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 # W64-NO-ADDR64: [[SAVEEXEC:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec # W64-NO-ADDR64-LABEL: bb.1: -# W64-NO-ADDR64-NEXT: successors: %bb.1({{.*}}), %bb.2({{.*}}) +# W64-NO-ADDR64-NEXT: successors: %bb.2({{.*}}) # W64-NO-ADDR64: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec # W64-NO-ADDR64: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec # W64-NO-ADDR64: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1 @@ -284,17 +296,19 @@ body: | # W64-NO-ADDR64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc # W64-NO-ADDR64: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 # W64-NO-ADDR64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec +# W64-NO-ADDR64-LABEL: bb.2: +# W64-NO-ADDR64-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}}) # W64-NO-ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec # W64-NO-ADDR64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc # W64-NO-ADDR64: SI_WATERFALL_LOOP %bb.1, implicit $exec -# W64-NO-ADDR64-LABEL: bb.2: +# W64-NO-ADDR64-LABEL: bb.3: # W64-NO-ADDR64: $exec = S_MOV_B64 [[SAVEEXEC]] # W32: successors: %bb.1({{.*}}) # W32: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 # W32: [[SAVEEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo # W32-LABEL: bb.1: -# W32-NEXT: successors: %bb.1({{.*}}), %bb.2({{.*}}) +# W32-NEXT: successors: %bb.2({{.*}}) # W32: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec # W32: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec # W32: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1 @@ -306,11 +320,13 @@ body: | # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc # W32: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec +# W32-LABEL: bb.2: +# W32-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}}) # W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec # TODO: S_XOR_B32_term should be `implicit-def $scc` # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] # W32: SI_WATERFALL_LOOP %bb.1, implicit $exec -# W32-LABEL: bb.2: +# W32-LABEL: bb.3: # W32: $exec_lo = S_MOV_B32 [[SAVEEXEC]] # ADDR64: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index 4eb59202ece524..de39119700d4c1 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -218,7 +218,7 @@ for.end: define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* %extern_func, float(float)* %extern_func2) #0 { ; SI-LABEL: name: loop ; SI: bb.0.main_body: - ; SI-NEXT: successors: %bb.5(0x40000000), %bb.1(0x40000000) + ; SI-NEXT: successors: %bb.6(0x40000000), %bb.1(0x40000000) ; SI-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr5 @@ -229,16 +229,16 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* % ; SI-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 ; SI-NEXT: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_I32_e64 6, killed [[COPY5]], implicit $exec ; SI-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; SI-NEXT: S_BRANCH %bb.5 + ; SI-NEXT: S_BRANCH %bb.6 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.1.Flow: - ; SI-NEXT: successors: %bb.2(0x40000000), %bb.8(0x40000000) + ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %29:vgpr_32, %bb.0, %4, %bb.7 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %45:vgpr_32, %bb.7 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %47:vgpr_32, %bb.7 - ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %49:vgpr_32, %bb.7 - ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %29:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %45:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %47:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %49:vgpr_32, %bb.9 + ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.2.if: @@ -248,15 +248,19 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* % ; SI-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; SI-NEXT: {{ $}} ; SI-NEXT: bb.3: - ; SI-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %51:vreg_64, %bb.3, [[REG_SEQUENCE]], %bb.2 - ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %53:vgpr_32, %bb.3, [[PHI1]], %bb.2 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %51:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %53:vgpr_32, %bb.4, [[PHI1]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; SI-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], killed [[PHI4]], implicit $exec ; SI-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U64_e64_]], implicit-def $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.4: + ; SI-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) + ; SI-NEXT: {{ $}} ; SI-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; SI-NEXT: [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 ; SI-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY6]] @@ -267,29 +271,33 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* % ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_]], implicit-def dead $scc ; SI-NEXT: SI_WATERFALL_LOOP %bb.3, implicit $exec ; SI-NEXT: {{ $}} - ; SI-NEXT: bb.4: - ; SI-NEXT: successors: %bb.8(0x80000000) + ; SI-NEXT: bb.5: + ; SI-NEXT: successors: %bb.10(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_]] ; SI-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] - ; SI-NEXT: S_BRANCH %bb.8 + ; SI-NEXT: S_BRANCH %bb.10 ; SI-NEXT: {{ $}} - ; SI-NEXT: bb.5.else: - ; SI-NEXT: successors: %bb.6(0x80000000) + ; SI-NEXT: bb.6.else: + ; SI-NEXT: successors: %bb.7(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY1]], %subreg.sub0, killed [[COPY]], %subreg.sub1 ; SI-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; SI-NEXT: {{ $}} - ; SI-NEXT: bb.6: - ; SI-NEXT: successors: %bb.6(0x40000000), %bb.7(0x40000000) + ; SI-NEXT: bb.7: + ; SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %55:vreg_64, %bb.6, [[REG_SEQUENCE2]], %bb.5 - ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %57:vgpr_32, %bb.6, [[COPY4]], %bb.5 + ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %55:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 + ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %57:vgpr_32, %bb.8, [[COPY4]], %bb.6 ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; SI-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], killed [[PHI6]], implicit $exec ; SI-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U64_e64_1]], implicit-def $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.8: + ; SI-NEXT: successors: %bb.7(0x40000000), %bb.9(0x40000000) + ; SI-NEXT: {{ $}} ; SI-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; SI-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 ; SI-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY9]] @@ -298,17 +306,17 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* % ; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; SI-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc - ; SI-NEXT: SI_WATERFALL_LOOP %bb.6, implicit $exec + ; SI-NEXT: SI_WATERFALL_LOOP %bb.7, implicit $exec ; SI-NEXT: {{ $}} - ; SI-NEXT: bb.7: + ; SI-NEXT: bb.9: ; SI-NEXT: successors: %bb.1(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_1]] ; SI-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY10]] ; SI-NEXT: S_BRANCH %bb.1 ; SI-NEXT: {{ $}} - ; SI-NEXT: bb.8.end: - ; SI-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, [[COPY8]], %bb.4 + ; SI-NEXT: bb.10.end: + ; SI-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, [[COPY8]], %bb.5 ; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: $vgpr0 = COPY killed [[PHI8]] ; SI-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0 @@ -333,7 +341,7 @@ end: define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float(float)* %extern_func, float(float)* %extern_func2) #0 { ; SI-LABEL: name: loop_with_use ; SI: bb.0.main_body: - ; SI-NEXT: successors: %bb.5(0x40000000), %bb.1(0x40000000) + ; SI-NEXT: successors: %bb.6(0x40000000), %bb.1(0x40000000) ; SI-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr5 @@ -344,15 +352,15 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float( ; SI-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 ; SI-NEXT: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_I32_e64 6, killed [[COPY5]], implicit $exec ; SI-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; SI-NEXT: S_BRANCH %bb.5 + ; SI-NEXT: S_BRANCH %bb.6 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.1.Flow: - ; SI-NEXT: successors: %bb.2(0x40000000), %bb.8(0x40000000) + ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %30:vgpr_32, %bb.0, %4, %bb.7 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %46:vgpr_32, %bb.7 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %48:vgpr_32, %bb.7 - ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %30:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %46:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %48:vgpr_32, %bb.9 + ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.2.if: @@ -362,14 +370,18 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float( ; SI-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; SI-NEXT: {{ $}} ; SI-NEXT: bb.3: - ; SI-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %50:vreg_64, %bb.3, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %50:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; SI-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], killed [[PHI3]], implicit $exec ; SI-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U64_e64_]], implicit-def $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.4: + ; SI-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) + ; SI-NEXT: {{ $}} ; SI-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; SI-NEXT: [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 ; SI-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY6]] @@ -380,28 +392,32 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float( ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_]], implicit-def dead $scc ; SI-NEXT: SI_WATERFALL_LOOP %bb.3, implicit $exec ; SI-NEXT: {{ $}} - ; SI-NEXT: bb.4: - ; SI-NEXT: successors: %bb.8(0x80000000) + ; SI-NEXT: bb.5: + ; SI-NEXT: successors: %bb.10(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_]] ; SI-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] - ; SI-NEXT: S_BRANCH %bb.8 + ; SI-NEXT: S_BRANCH %bb.10 ; SI-NEXT: {{ $}} - ; SI-NEXT: bb.5.else: - ; SI-NEXT: successors: %bb.6(0x80000000) + ; SI-NEXT: bb.6.else: + ; SI-NEXT: successors: %bb.7(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY1]], %subreg.sub0, killed [[COPY]], %subreg.sub1 ; SI-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; SI-NEXT: {{ $}} - ; SI-NEXT: bb.6: - ; SI-NEXT: successors: %bb.6(0x40000000), %bb.7(0x40000000) + ; SI-NEXT: bb.7: + ; SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %52:vreg_64, %bb.6, [[REG_SEQUENCE2]], %bb.5 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %52:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; SI-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], killed [[PHI4]], implicit $exec ; SI-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U64_e64_1]], implicit-def $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.8: + ; SI-NEXT: successors: %bb.7(0x40000000), %bb.9(0x40000000) + ; SI-NEXT: {{ $}} ; SI-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; SI-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 ; SI-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY9]] @@ -410,17 +426,17 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float( ; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; SI-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc - ; SI-NEXT: SI_WATERFALL_LOOP %bb.6, implicit $exec + ; SI-NEXT: SI_WATERFALL_LOOP %bb.7, implicit $exec ; SI-NEXT: {{ $}} - ; SI-NEXT: bb.7: + ; SI-NEXT: bb.9: ; SI-NEXT: successors: %bb.1(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_1]] ; SI-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY10]] ; SI-NEXT: S_BRANCH %bb.1 ; SI-NEXT: {{ $}} - ; SI-NEXT: bb.8.end: - ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, [[COPY8]], %bb.4 + ; SI-NEXT: bb.10.end: + ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, [[COPY8]], %bb.5 ; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: %27:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI5]], killed [[COPY4]], implicit $mode, implicit $exec ; SI-NEXT: $vgpr0 = COPY killed %27 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll index dac60fc774039a..b9e03a66890948 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll @@ -169,50 +169,50 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* % ; SI-NEXT: s_mov_b32 s32, 0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo -; SI-NEXT: s_xor_b32 s4, exec_lo, s0 +; SI-NEXT: s_xor_b32 s6, exec_lo, s0 ; SI-NEXT: s_cbranch_execz .LBB3_4 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s5, exec_lo +; SI-NEXT: s_mov_b32 s7, exec_lo ; SI-NEXT: .LBB3_2: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_readfirstlane_b32 s6, v4 -; SI-NEXT: v_readfirstlane_b32 s7, v5 -; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[4:5] +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_readfirstlane_b32 s5, v5 +; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] ; SI-NEXT: s_and_saveexec_b32 s8, vcc_lo ; SI-NEXT: s_mov_b64 s[0:1], s[12:13] ; SI-NEXT: s_mov_b64 s[2:3], s[14:15] -; SI-NEXT: s_swappc_b64 s[30:31], s[6:7] +; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SI-NEXT: v_mov_b32_e32 v1, v0 ; SI-NEXT: ; implicit-def: $vgpr4_vgpr5 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8 ; SI-NEXT: s_cbranch_execnz .LBB3_2 ; SI-NEXT: ; %bb.3: -; SI-NEXT: s_mov_b32 exec_lo, s5 +; SI-NEXT: s_mov_b32 exec_lo, s7 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: .LBB3_4: ; %Flow -; SI-NEXT: s_or_saveexec_b32 s4, s4 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; SI-NEXT: s_or_saveexec_b32 s6, s6 +; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s6 ; SI-NEXT: s_cbranch_execz .LBB3_8 ; SI-NEXT: ; %bb.5: ; %if -; SI-NEXT: s_mov_b32 s5, exec_lo +; SI-NEXT: s_mov_b32 s7, exec_lo ; SI-NEXT: .LBB3_6: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_readfirstlane_b32 s6, v2 -; SI-NEXT: v_readfirstlane_b32 s7, v3 -; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[2:3] +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: v_readfirstlane_b32 s5, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3] ; SI-NEXT: s_and_saveexec_b32 s8, vcc_lo ; SI-NEXT: s_mov_b64 s[0:1], s[12:13] ; SI-NEXT: s_mov_b64 s[2:3], s[14:15] -; SI-NEXT: s_swappc_b64 s[30:31], s[6:7] +; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SI-NEXT: v_mov_b32_e32 v1, v0 ; SI-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8 ; SI-NEXT: s_cbranch_execnz .LBB3_6 ; SI-NEXT: ; %bb.7: -; SI-NEXT: s_mov_b32 exec_lo, s5 +; SI-NEXT: s_mov_b32 exec_lo, s7 ; SI-NEXT: .LBB3_8: ; %end -; SI-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; SI-NEXT: v_mov_b32_e32 v0, v1 ; SI-NEXT: ; return to shader part epilog main_body: @@ -247,47 +247,47 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float( ; SI-NEXT: s_mov_b32 s32, 0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo -; SI-NEXT: s_xor_b32 s4, exec_lo, s0 +; SI-NEXT: s_xor_b32 s6, exec_lo, s0 ; SI-NEXT: s_cbranch_execz .LBB4_4 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s5, exec_lo +; SI-NEXT: s_mov_b32 s7, exec_lo ; SI-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_readfirstlane_b32 s6, v4 -; SI-NEXT: v_readfirstlane_b32 s7, v5 -; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[4:5] +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_readfirstlane_b32 s5, v5 +; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] ; SI-NEXT: s_and_saveexec_b32 s8, vcc_lo ; SI-NEXT: v_mov_b32_e32 v0, v40 ; SI-NEXT: s_mov_b64 s[0:1], s[12:13] ; SI-NEXT: s_mov_b64 s[2:3], s[14:15] -; SI-NEXT: s_swappc_b64 s[30:31], s[6:7] +; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SI-NEXT: ; implicit-def: $vgpr4_vgpr5 ; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8 ; SI-NEXT: s_cbranch_execnz .LBB4_2 ; SI-NEXT: ; %bb.3: -; SI-NEXT: s_mov_b32 exec_lo, s5 +; SI-NEXT: s_mov_b32 exec_lo, s7 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: .LBB4_4: ; %Flow -; SI-NEXT: s_or_saveexec_b32 s4, s4 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; SI-NEXT: s_or_saveexec_b32 s6, s6 +; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s6 ; SI-NEXT: s_cbranch_execz .LBB4_8 ; SI-NEXT: ; %bb.5: ; %if -; SI-NEXT: s_mov_b32 s5, exec_lo +; SI-NEXT: s_mov_b32 s7, exec_lo ; SI-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_readfirstlane_b32 s6, v2 -; SI-NEXT: v_readfirstlane_b32 s7, v3 -; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[2:3] +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: v_readfirstlane_b32 s5, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3] ; SI-NEXT: s_and_saveexec_b32 s8, vcc_lo ; SI-NEXT: v_mov_b32_e32 v0, v40 ; SI-NEXT: s_mov_b64 s[0:1], s[12:13] ; SI-NEXT: s_mov_b64 s[2:3], s[14:15] -; SI-NEXT: s_swappc_b64 s[30:31], s[6:7] +; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SI-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8 ; SI-NEXT: s_cbranch_execnz .LBB4_6 ; SI-NEXT: ; %bb.7: -; SI-NEXT: s_mov_b32 exec_lo, s5 +; SI-NEXT: s_mov_b32 exec_lo, s7 ; SI-NEXT: .LBB4_8: ; %end -; SI-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; SI-NEXT: v_add_f32_e32 v0, v0, v40 ; SI-NEXT: ; return to shader part epilog main_body: