diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 3e95c55df57e8a..b77499e0fee9eb 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -82,9 +82,47 @@ static cl::opt<bool> EnableM0Merge(
   cl::init(true));
 
 namespace {
+
+class V2SCopyInfo {
+public:
+  // VGPR to SGPR copy being processed
+  MachineInstr *Copy;
+  // All SALU instructions reachable from this copy in SSA graph
+  DenseSet<MachineInstr *> SChain;
+  // Number of SGPR to VGPR copies that are used to put the SALU computation
+  // results back to VALU.
+  unsigned NumSVCopies;
+
+  unsigned Score;
+  // Actual count of v_readfirstlane_b32
+  // which need to be inserted to keep SChain SALU
+  unsigned NumReadfirstlanes;
+  // Current score state. To speed up the selection of V2SCopyInfos for processing
+  bool NeedToBeConvertedToVALU = false;
+  // Unique ID. Used as a key for mapping to keep permanent order.
+  unsigned ID;
+
+  // Count of other VGPR to SGPR copies that contribute to the
+  // current copy SChain
+  unsigned SiblingPenalty = 0;
+  SetVector<unsigned> Siblings;
+  V2SCopyInfo() : Copy(nullptr), ID(0){};
+  V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width)
+      : Copy(C), NumSVCopies(0), NumReadfirstlanes(Width / 32), ID(Id){};
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  void dump() {
+    dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size()
+           << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty
+           << "\nScore: " << Score << "\n";
+  }
+#endif
+};
+
 class SIFixSGPRCopies : public MachineFunctionPass {
   MachineDominatorTree *MDT;
   unsigned NextVGPRToSGPRCopyID;
+  DenseMap<unsigned, V2SCopyInfo> V2SCopies;
+  DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;
 
 public:
   static char ID;
@@ -97,6 +135,7 @@ class SIFixSGPRCopies : public MachineFunctionPass {
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; }
+  void analyzeVGPRToSGPRCopy(V2SCopyInfo& Info);
   void lowerVGPR2SGPRCopies(MachineFunction &MF);
   // Handles copies which source register is:
   // 1. Physical register
   // 2. AGPR
   // 3. Defined by the instruction that merely moves the immediate
   bool lowerSpecialCase(MachineInstr &MI);
 
-  MachineBasicBlock *processPHINode(MachineInstr &MI);
+  void processPHINode(MachineInstr &MI);
 
   StringRef getPassName() const override { return "SI Fix SGPR copies"; }
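[Editor's aside, not part of the patch.] The Width / 32 initializer above is the entire cost of keeping a copy scalar: an N-bit VGPR-to-SGPR copy needs one v_readfirstlane_b32 per 32-bit subregister. A minimal, runnable sketch of that arithmetic (the helper name is hypothetical; the real weighting against SChain and NumSVCopies lives in needToBeConvertedToVALU further down):

  #include <cassert>

  // Hypothetical helper: number of v_readfirstlane_b32 instructions needed
  // to move an N-bit VGPR value into SGPRs, one per 32-bit lane.
  static unsigned numReadFirstLanes(unsigned WidthInBits) {
    return WidthInBits / 32;
  }

  int main() {
    assert(numReadFirstLanes(32) == 1);  // a single s_mov-sized value
    assert(numReadFirstLanes(128) == 4); // e.g. a vreg_128 resource descriptor
    return 0;
  }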
@@ -663,15 +702,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
         break;
       }
       case AMDGPU::PHI: {
-        MachineBasicBlock *NewBB = processPHINode(MI);
-        if (NewBB && NewBB != MBB) {
-          MBB = NewBB;
-          E = MBB->end();
-          BI = MachineFunction::iterator(MBB);
-          BE = MF.end();
-        }
-        assert((!NewBB || NewBB == I->getParent()) &&
-               "moveToVALU did not return the right basic block");
+        processPHINode(MI);
         break;
       }
       case AMDGPU::REG_SEQUENCE: {
@@ -681,17 +712,6 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
           continue;
         }
 
-        LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
-
-        MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
-        if (NewBB && NewBB != MBB) {
-          MBB = NewBB;
-          E = MBB->end();
-          BI = MachineFunction::iterator(MBB);
-          BE = MF.end();
-        }
-        assert((!NewBB || NewBB == I->getParent()) &&
-               "moveToVALU did not return the right basic block");
         break;
       }
       case AMDGPU::INSERT_SUBREG: {
@@ -783,58 +803,42 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
   if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
     hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);
 
+  SiblingPenalty.clear();
+  V2SCopies.clear();
+
   return true;
 }
 
-MachineBasicBlock *SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
-  unsigned numVGPRUses = 0;
+void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
   bool AllAGPRUses = true;
   SetVector<const MachineInstr *> worklist;
   SmallSet<const MachineInstr *, 4> Visited;
   SetVector<MachineInstr *> PHIOperands;
-  MachineBasicBlock *CreatedBB = nullptr;
   worklist.insert(&MI);
   Visited.insert(&MI);
+  // HACK to make MIR tests with no uses happy
+  bool HasUses = false;
   while (!worklist.empty()) {
     const MachineInstr *Instr = worklist.pop_back_val();
     Register Reg = Instr->getOperand(0).getReg();
     for (const auto &Use : MRI->use_operands(Reg)) {
+      HasUses = true;
       const MachineInstr *UseMI = Use.getParent();
       AllAGPRUses &= (UseMI->isCopy() &&
                       TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) ||
                      TRI->isAGPR(*MRI, Use.getReg());
       if (UseMI->isCopy() || UseMI->isRegSequence()) {
-        if (UseMI->isCopy() &&
-            UseMI->getOperand(0).getReg().isPhysical() &&
-            !TRI->isSGPRReg(*MRI, UseMI->getOperand(0).getReg())) {
-          numVGPRUses++;
-        }
         if (Visited.insert(UseMI).second)
           worklist.insert(UseMI);
         continue;
       }
-
-      if (UseMI->isPHI()) {
-        const TargetRegisterClass *UseRC = MRI->getRegClass(Use.getReg());
-        if (!TRI->isSGPRReg(*MRI, Use.getReg()) &&
-            UseRC != &AMDGPU::VReg_1RegClass)
-          numVGPRUses++;
-        continue;
-      }
-
-      const TargetRegisterClass *OpRC =
-          TII->getOpRegClass(*UseMI, UseMI->getOperandNo(&Use));
-      if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass &&
-          OpRC != &AMDGPU::VS_64RegClass) {
-        numVGPRUses++;
-      }
     }
   }
 
   Register PHIRes = MI.getOperand(0).getReg();
   const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
-  if (AllAGPRUses && numVGPRUses && !TRI->isAGPRClass(RC0)) {
+  if (HasUses && AllAGPRUses && !TRI->isAGPRClass(RC0)) {
     LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
     MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
     for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
@@ -844,49 +848,8 @@ MachineBasicBlock *SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
     }
   }
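[Editor's aside, not part of the patch.] The hunk above handles the all-AGPR case: when every transitive use of the PHI result is an AGPR use, the PHI is retyped in place rather than moved to VALU, and its incoming values are queued for the same treatment. The core step, distilled into a sketch (assumes the usual AMDGPU backend headers such as SIRegisterInfo.h; the helper name is hypothetical):

  // Retype one PHI result to the equally sized AGPR class, as the
  // MRI->setRegClass call above does; the operands are handled recursively.
  static void retypePHIToAGPR(MachineInstr &Phi, MachineRegisterInfo &MRI,
                              const SIRegisterInfo &TRI) {
    Register Res = Phi.getOperand(0).getReg();
    const TargetRegisterClass *RC = MRI.getRegClass(Res);
    if (!TRI.isAGPRClass(RC))
      MRI.setRegClass(Res, TRI.getEquivalentAGPRClass(RC));
  }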
 
-  bool hasVGPRInput = false;
-  for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
-    Register InputReg = MI.getOperand(i).getReg();
-    MachineInstr *Def = MRI->getVRegDef(InputReg);
-    if (TRI->isVectorRegister(*MRI, InputReg)) {
-      if (Def->isCopy()) {
-        Register SrcReg = Def->getOperand(1).getReg();
-        const TargetRegisterClass *RC =
-            TRI->getRegClassForReg(*MRI, SrcReg);
-        if (TRI->isSGPRClass(RC))
-          continue;
-      }
-      hasVGPRInput = true;
-      break;
-    }
-    else if (Def->isCopy() &&
-             TRI->isVectorRegister(*MRI, Def->getOperand(1).getReg())) {
-      Register SrcReg = Def->getOperand(1).getReg();
-      MachineInstr *SrcDef = MRI->getVRegDef(SrcReg);
-      unsigned SMovOp;
-      int64_t Imm;
-      if (!isSafeToFoldImmIntoCopy(Def, SrcDef, TII, SMovOp, Imm)) {
-        hasVGPRInput = true;
-        break;
-      } else {
-        // Formally, if we did not do this right away
-        // it would be done on the next iteration of the
-        // runOnMachineFunction main loop. But why not if we can?
-        MachineFunction *MF = MI.getParent()->getParent();
-        Def->getOperand(1).ChangeToImmediate(Imm);
-        Def->addImplicitDefUseOperands(*MF);
-        Def->setDesc(TII->get(SMovOp));
-      }
-    }
-  }
-
-  if ((!TRI->isVectorRegister(*MRI, PHIRes) &&
-       RC0 != &AMDGPU::VReg_1RegClass) &&
-      (hasVGPRInput || numVGPRUses > 1)) {
-    LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
-    CreatedBB = TII->moveToVALU(MI);
-  }
-  else {
+  if (TRI->isVectorRegister(*MRI, PHIRes) ||
+      RC0 == &AMDGPU::VReg_1RegClass) {
     LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
     TII->legalizeOperands(MI, MDT);
   }
@@ -895,18 +858,9 @@ MachineBasicBlock *SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
   while (!PHIOperands.empty()) {
     processPHINode(*PHIOperands.pop_back_val());
   }
-  return CreatedBB;
 }
 
 bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI) {
-  MachineBasicBlock *MBB = MI.getParent();
-  const TargetRegisterClass *SrcRC, *DstRC;
-  std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
-
-  // We return true to indicate that no further processing needed
-  if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
-    return true;
-
   Register SrcReg = MI.getOperand(1).getReg();
   if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
     TII->moveToVALU(MI, MDT);
@@ -919,73 +873,89 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI) {
   // s_mov_b32.
   if (isSafeToFoldImmIntoCopy(&MI, MRI->getVRegDef(SrcReg), TII, SMovOp, Imm)) {
     MI.getOperand(1).ChangeToImmediate(Imm);
-    MI.addImplicitDefUseOperands(*MBB->getParent());
+    MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
     MI.setDesc(TII->get(SMovOp));
     return true;
   }
   return false;
 }
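[Editor's aside, not part of the patch.] lowerSpecialCase now only sees genuine VGPR-to-SGPR copies, since the register-class check moved to the caller. Its immediate-folding path rewrites, e.g., %s:sgpr_32 = COPY %v, where %v:vgpr_32 = V_MOV_B32_e32 42, into %s:sgpr_32 = S_MOV_B32 42, so no value crosses register banks at all. A sketch of that rewrite, mirroring the hunk above (SMovOp and Imm are produced by isSafeToFoldImmIntoCopy):

  // Rewrite the COPY in place into a scalar move of the immediate it
  // transitively reads.
  static void foldImmIntoCopy(MachineInstr &MI, unsigned SMovOp, int64_t Imm,
                              const SIInstrInfo *TII) {
    MI.getOperand(1).ChangeToImmediate(Imm);   // source operand becomes a literal
    MI.addImplicitDefUseOperands(*MI.getMF()); // refresh implicit operands
    MI.setDesc(TII->get(SMovOp));              // COPY -> S_MOV_B32 (or variant)
  }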
-  unsigned ID;
-  // Count of another VGPR to SGPR copies that contribute to the
-  // current copy SChain
-  unsigned SiblingPenalty = 0;
-  SetVector<unsigned> Siblings;
-  V2SCopyInfo() : Copy(nullptr), ID(0){};
-  V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width)
-      : Copy(C), NumSVCopies(0), NumReadfirstlanes(Width / 32), ID(Id){};
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  void dump() {
-    dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size()
-           << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty
-           << "\nScore: " << Score << "\n";
+void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(V2SCopyInfo& Info) {
+  SmallVector<MachineInstr *, 8> AnalysisWorklist;
+  // Needed because the SSA is not a tree but a graph and may have
+  // forks and joins. We should not then go the same way twice.
+  DenseSet<MachineInstr *> Visited;
+  AnalysisWorklist.push_back(Info.Copy);
+  while (!AnalysisWorklist.empty()) {
+
+    MachineInstr *Inst = AnalysisWorklist.pop_back_val();
+
+    if (!Visited.insert(Inst).second)
+      continue;
+
+    // Copies and REG_SEQUENCE do not contribute to the final assembly.
+    // So, skip them but take care of the SGPR to VGPR copies bookkeeping.
+    if (Inst->isCopy() || Inst->isRegSequence()) {
+      if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
+        if (!Inst->isCopy() ||
+            !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
+          Info.NumSVCopies++;
+          continue;
+        }
+      }
+    }
+
+    SiblingPenalty[Inst].insert(Info.ID);
+
+    SmallVector<MachineInstr *, 4> Users;
+    if ((TII->isSALU(*Inst) && Inst->isCompare()) ||
+        (Inst->isCopy() && Inst->getOperand(0).getReg() == AMDGPU::SCC)) {
+      auto I = Inst->getIterator();
+      auto E = Inst->getParent()->end();
+      while (++I != E && !I->findRegisterDefOperand(AMDGPU::SCC)) {
+        if (I->readsRegister(AMDGPU::SCC))
+          Users.push_back(&*I);
+      }
+    } else if (Inst->getNumExplicitDefs() != 0) {
+      Register Reg = Inst->getOperand(0).getReg();
+      if (TRI->isSGPRReg(*MRI, Reg) && !TII->isVALU(*Inst))
+        for (auto &U : MRI->use_instructions(Reg))
+          Users.push_back(&U);
+    }
+    for (auto U : Users) {
+      if (TII->isSALU(*U))
+        Info.SChain.insert(U);
+      AnalysisWorklist.push_back(U);
+    }
   }
-#endif
-};
+}
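[Editor's aside, not part of the patch.] One subtlety in analyzeVGPRToSGPRCopy: a SALU compare defines SCC, not a virtual register, so its consumers are invisible to MRI->use_instructions. The walk therefore scans forward from the compare to every SCC reader, stopping as soon as the next SCC definition clobbers the value. The same loop, extracted for clarity (assumes the usual AMDGPU backend headers):

  // Collect the instructions that consume the SCC value produced by Def,
  // stopping at the next redefinition of SCC in the same block.
  static SmallVector<MachineInstr *, 4> collectSCCUsers(MachineInstr &Def) {
    SmallVector<MachineInstr *, 4> Users;
    auto I = Def.getIterator();
    auto E = Def.getParent()->end();
    while (++I != E && !I->findRegisterDefOperand(AMDGPU::SCC)) {
      if (I->readsRegister(AMDGPU::SCC))
        Users.push_back(&*I);
    }
    return Users;
  }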
 
 void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
-  DenseMap<unsigned, V2SCopyInfo> Copies;
-  DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;
-
   // The main function that computes the VGPR to SGPR copy score
   // and determines copy further lowering way: v_readfirstlane_b32 or moveToVALU
   auto needToBeConvertedToVALU = [&](V2SCopyInfo *I) -> bool {
-    if (I->SChain.empty())
+    if (I->SChain.empty()) {
+      I->Score = 0;
       return true;
+    }
     I->Siblings = SiblingPenalty[*std::max_element(
         I->SChain.begin(), I->SChain.end(),
         [&](MachineInstr *A, MachineInstr *B) -> bool {
           return SiblingPenalty[A].size() < SiblingPenalty[B].size();
         })];
     I->Siblings.remove_if([&](unsigned ID) { return ID == I->ID; });
-    // The loop below computes the number of another VGPR to SGPR copies
+    // The loop below computes the number of other VGPR to SGPR V2SCopies
     // which contribute to the current copy SALU chain. We assume that all the
-    // copies with the same source virtual register will be squashed to one by
-    // regalloc. Also we take careof the copies of the differnt subregs of the
-    // same register.
+    // V2SCopies with the same source virtual register will be squashed to one
+    // by regalloc. Also we take care of the V2SCopies of the different subregs
+    // of the same register.
     SmallSet<std::pair<Register, unsigned>, 4> SrcRegs;
     for (auto J : I->Siblings) {
-      auto InfoIt = Copies.find(J);
-      if (InfoIt != Copies.end()) {
+      auto InfoIt = V2SCopies.find(J);
+      if (InfoIt != V2SCopies.end()) {
         MachineInstr *SiblingCopy = InfoIt->getSecond().Copy;
         if (SiblingCopy->isImplicitDef())
           // the COPY has already been MoveToVALUed
           continue;
@@ -1012,97 +982,106 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
     case AMDGPU::STRICT_WQM:
     case AMDGPU::SOFT_WQM:
     case AMDGPU::STRICT_WWM:
+    case AMDGPU::REG_SEQUENCE:
+    case AMDGPU::PHI:
       return true;
     default:
       return false;
     }
   };
 
+  SmallSet<MachineInstr *, 4> OutOfOrderProcessedCopies;
+
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
        ++BI) {
     MachineBasicBlock *MBB = &*BI;
     for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
          ++I) {
-      MachineInstr &MI = *I;
-      if (!needProcessing(MI))
-        continue;
-      if (lowerSpecialCase(MI))
+      MachineInstr *MI = &*I;
+      if (!needProcessing(*MI))
         continue;
 
-      // Compute the COPY width to pass it to V2SCopyInfo Ctor
-      Register DstReg = MI.getOperand(0).getReg();
+      if (MI->isRegSequence() || MI->isPHI()) {
+        MachineBasicBlock::iterator J = I;
+        if (TRI->isSGPRClass(TII->getOpRegClass(*MI, 0))) {
+          for (MachineOperand &MO : MI->operands()) {
+            if (!MO.isReg() || !MO.getReg().isVirtual())
+              continue;
+            const TargetRegisterClass *SrcRC = MRI->getRegClass(MO.getReg());
+            if (TRI->hasVectorRegisters(SrcRC)) {
+              const TargetRegisterClass *DestRC =
+                  TRI->getEquivalentSGPRClass(SrcRC);
+              Register NewDst = MRI->createVirtualRegister(DestRC);
+              MachineBasicBlock *BlockToInsertCopy = MBB;
+              MachineBasicBlock::iterator PointToInsertCopy = I;
+              if (MI->isPHI()) {
+                BlockToInsertCopy =
+                    MI->getOperand(MI->getOperandNo(&MO) + 1).getMBB();
+                PointToInsertCopy =
+                    BlockToInsertCopy->getFirstInstrTerminator();
+              }
+              MachineBasicBlock::iterator NewI =
+                  BuildMI(*BlockToInsertCopy, PointToInsertCopy,
+                          PointToInsertCopy->getDebugLoc(),
+                          TII->get(AMDGPU::COPY), NewDst)
+                      .addReg(MO.getReg());
+              MO.setReg(NewDst);
+              if (!MI->isPHI()) {
+                I = NewI;
+                MI = &*I;
+              } else {
+                // We insert the copy into the basic block that may have been
+                // already processed. Pass it to the analysis explicitly.
+                V2SCopyInfo In(getNextVGPRToSGPRCopyId(), MI,
+                               TRI->getRegSizeInBits(*DestRC));
+                analyzeVGPRToSGPRCopy(In);
+                V2SCopies[In.ID] = In;
+                OutOfOrderProcessedCopies.insert(MI);
+              }
+            }
+          }
+        }
 
-      const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, DstReg);
+        if (J == I)
+          continue;
+      }
 
-      V2SCopyInfo In(getNextVGPRToSGPRCopyId(), &MI,
-                     TRI->getRegSizeInBits(*DstRC));
+      const TargetRegisterClass *SrcRC, *DstRC;
+      std::tie(SrcRC, DstRC) = getCopyRegClasses(*MI, *TRI, *MRI);
 
-      SmallVector<MachineInstr *, 8> AnalysisWorklist;
-      // Needed because the SSA is not a tree but a graph and may have
-      // forks and joins. We should not then go same way twice.
-      DenseSet<MachineInstr *> Visited;
-      AnalysisWorklist.push_back(&MI);
-      while (!AnalysisWorklist.empty()) {
+      if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
+        continue;
 
-        MachineInstr *Inst = AnalysisWorklist.pop_back_val();
+      if (lowerSpecialCase(*MI))
+        continue;
 
-        if (!Visited.insert(Inst).second)
-          continue;
+      if (OutOfOrderProcessedCopies.contains(MI))
+        continue;
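[Editor's aside, not part of the patch.] Why OutOfOrderProcessedCopies exists: a copy feeding a PHI cannot be placed next to the PHI itself; it must go into the predecessor block that supplies the operand, ahead of that block's terminators -- a block the outer scan may already have visited. Such copies are therefore analyzed eagerly above and skipped when the scan later reaches them. The placement rule, distilled (for a PHI, operand i is the value and operand i+1 its incoming block):

  // Emit the bridging COPY for PHI operand OpIdx in its incoming block,
  // before that block's terminators, as the code above does.
  static MachineInstr *copyPHIInput(MachineInstr &Phi, unsigned OpIdx,
                                    Register NewDst, const SIInstrInfo *TII) {
    MachineBasicBlock *Pred = Phi.getOperand(OpIdx + 1).getMBB();
    MachineBasicBlock::iterator IP = Pred->getFirstInstrTerminator();
    return BuildMI(*Pred, IP, IP->getDebugLoc(), TII->get(AMDGPU::COPY), NewDst)
        .addReg(Phi.getOperand(OpIdx).getReg());
  }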
-        // Copies and REG_SEQUENCE do not contribute to the final assembly
-        // So, skip them but take care of the SGPR to VGPR copies bookkeeping.
-        if (Inst->isCopy() || Inst->isRegSequence()) {
-          if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
-            if (!Inst->isCopy() ||
-                !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
-              In.NumSVCopies++;
-              continue;
-            }
-          }
-        }
+      V2SCopyInfo In(getNextVGPRToSGPRCopyId(), MI,
+                     TRI->getRegSizeInBits(*DstRC));
 
-        SiblingPenalty[Inst].insert(In.ID);
+      analyzeVGPRToSGPRCopy(In);
 
-        SmallVector<MachineInstr *, 4> Users;
-        if ((TII->isSALU(*Inst) && Inst->isCompare()) ||
-            (Inst->isCopy() && Inst->getOperand(0).getReg() == AMDGPU::SCC)) {
-          auto I = Inst->getIterator();
-          auto E = Inst->getParent()->end();
-          while (++I != E && !I->findRegisterDefOperand(AMDGPU::SCC)) {
-            if (I->readsRegister(AMDGPU::SCC))
-              Users.push_back(&*I);
-          }
-        } else if (Inst->getNumExplicitDefs() != 0) {
-          Register Reg = Inst->getOperand(0).getReg();
-          if (TRI->isSGPRReg(*MRI, Reg))
-            for (auto &U : MRI->use_instructions(Reg))
-              Users.push_back(&U);
-        }
-        for (auto U : Users) {
-          if (TII->isSALU(*U))
-            In.SChain.insert(U);
-          AnalysisWorklist.push_back(U);
-        }
-      }
-      Copies[In.ID] = In;
+      V2SCopies[In.ID] = In;
     }
   }
 
   SmallVector<unsigned> LoweringWorklist;
-  for (auto &C : Copies) {
+  for (auto &C : V2SCopies) {
     if (needToBeConvertedToVALU(&C.second))
       LoweringWorklist.push_back(C.second.ID);
   }
 
   while (!LoweringWorklist.empty()) {
     unsigned CurID = LoweringWorklist.pop_back_val();
-    auto CurInfoIt = Copies.find(CurID);
-    if (CurInfoIt != Copies.end()) {
+    auto CurInfoIt = V2SCopies.find(CurID);
+    if (CurInfoIt != V2SCopies.end()) {
       V2SCopyInfo C = CurInfoIt->getSecond();
       LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump());
       for (auto S : C.Siblings) {
-        auto SibInfoIt = Copies.find(S);
-        if (SibInfoIt != Copies.end()) {
+        auto SibInfoIt = V2SCopies.find(S);
+        if (SibInfoIt != V2SCopies.end()) {
           V2SCopyInfo &SI = SibInfoIt->getSecond();
           LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump());
           if (!SI.NeedToBeConvertedToVALU) {
@@ -1115,13 +1094,13 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
       }
       LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy
                         << " is being turned to VALU\n");
-      Copies.erase(C.ID);
+      V2SCopies.erase(C.ID);
       TII->moveToVALU(*C.Copy, MDT);
     }
   }
 
   // Now do actual lowering
-  for (auto C : Copies) {
+  for (auto C : V2SCopies) {
     MachineInstr *MI = C.second.Copy;
     MachineBasicBlock *MBB = MI->getParent();
     // We decide to turn V2S copy to v_readfirstlane_b32
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index ad3b0d08ca0d66..bce6ba45c31219 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -1498,7 +1498,7 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v3
@@ -1520,7 +1520,7 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
 ; GFX6-NEXT:    s_sub_i32 s4, 0, s11
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
@@ -1531,7 +1531,7 @@ define amdgpu_kernel void
@urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x ; GFX6-NEXT: v_mul_lo_u32 v2, v2, s10 ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v5 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s10, v2 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 @@ -1837,7 +1837,7 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: s_sub_i32 s0, 0, s9 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 ; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 @@ -1887,7 +1887,7 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x ; GFX6-NEXT: s_add_i32 s1, s6, s0 ; GFX6-NEXT: s_xor_b32 s5, s5, s2 ; GFX6-NEXT: s_xor_b32 s1, s1, s0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s5 ; GFX6-NEXT: v_mul_hi_u32 v2, s1, v2 ; GFX6-NEXT: s_xor_b32 s3, s0, s3 @@ -1909,7 +1909,7 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x ; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v2 ; GFX6-NEXT: s_xor_b32 s2, s0, s2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_mul_hi_u32 v4, s1, v4 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc @@ -3005,7 +3005,7 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x ; GFX6-NEXT: v_mov_b32_e32 v3, s8 ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 ; GFX6-NEXT: s_xor_b32 s4, s4, s6 @@ -3021,7 +3021,7 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc ; GFX6-NEXT: s_sext_i32_i16 s6, s5 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v1 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v1, v3 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GFX6-NEXT: s_xor_b32 s4, s6, s4 @@ -3050,7 +3050,7 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x ; GFX6-NEXT: v_mov_b32_e32 v6, s4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 @@ -3273,7 +3273,7 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GFX6-NEXT: s_ashr_i32 s9, s6, 16 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 ; GFX6-NEXT: s_lshr_b32 s8, s4, 16 @@ -3312,7 +3312,7 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX6-NEXT: v_cndmask_b32_e32 
v1, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 ; GFX6-NEXT: s_lshr_b32 s6, s7, 16 ; GFX6-NEXT: s_ashr_i32 s7, s5, 16 @@ -3328,7 +3328,7 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x ; GFX6-NEXT: v_mov_b32_e32 v6, s4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, s6 ; GFX6-NEXT: s_lshr_b32 s4, s5, 16 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 @@ -3626,7 +3626,7 @@ define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -3710,7 +3710,7 @@ define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GFX6-NEXT: s_lshr_b32 s3, s4, 8 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -4196,7 +4196,7 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x ; GFX6-NEXT: v_mov_b32_e32 v3, s8 ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 ; GFX6-NEXT: s_xor_b32 s4, s4, s6 @@ -4212,7 +4212,7 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc ; GFX6-NEXT: s_sext_i32_i16 s5, s5 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 @@ -4225,7 +4225,7 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x ; GFX6-NEXT: v_mov_b32_e32 v5, s4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -4403,7 +4403,7 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GFX6-NEXT: s_ashr_i32 s9, s6, 16 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 ; GFX6-NEXT: s_lshr_b32 s8, s4, 16 @@ -4439,7 +4439,7 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x ; GFX6-NEXT: v_mov_b32_e32 v5, s4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, s7 ; GFX6-NEXT: 
v_sub_i32_e32 v1, vcc, s8, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -4826,7 +4826,7 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x ; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: s_lshr_b32 s4, s6, 15 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v1 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v6 @@ -5029,7 +5029,7 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x ; GFX6-NEXT: v_mov_b32_e32 v6, s4 ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 @@ -5042,7 +5042,7 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 @@ -5234,7 +5234,7 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x ; GFX6-NEXT: v_mov_b32_e32 v7, s5 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| ; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v7, vcc -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GFX6-NEXT: v_mul_lo_u32 v4, v4, s8 ; GFX6-NEXT: s_bfe_i32 s5, s8, 0xf000f ; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s5 @@ -5275,7 +5275,7 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x ; GFX6-NEXT: s_lshr_b32 s7, s8, 15 ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX6-NEXT: v_mul_lo_u32 v5, v5, s7 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v5 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 @@ -5550,7 +5550,7 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s3, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v0 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -5669,9 +5669,9 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou ; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 @@ -6000,9 +6000,9 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; 
GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 @@ -6187,7 +6187,7 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 % ; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 @@ -7079,7 +7079,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 @@ -7117,9 +7117,9 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s9 ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9 ; GFX6-NEXT: v_mov_b32_e32 v5, 0x11f -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s9 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc @@ -7467,7 +7467,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GFX6-NEXT: v_mul_lo_u32 v4, v0, s6 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v4 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -7752,7 +7752,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -7787,8 +7787,8 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s12 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s12 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s12 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 ; GFX6-NEXT: v_mov_b32_e32 v3, 0x11f ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 @@ -8198,9 +8198,9 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 ; GFX6-NEXT: s_mov_b32 s0, 0x12d8fb ; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -8852,7 +8852,7 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> 
addrspace(1)* ; GFX6-NEXT: s_mov_b32 s11, s10 ; GFX6-NEXT: s_addc_u32 s1, s3, s10 ; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[10:11] -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -9160,9 +9160,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, s13 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, s12, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc @@ -9280,9 +9280,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s14, v0 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, s3 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s6, v5 @@ -9982,7 +9982,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 % ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 ; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -10354,7 +10354,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -10468,7 +10468,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GFX6-NEXT: v_mul_hi_u32 v5, s0, v2 ; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2 ; GFX6-NEXT: v_xor_b32_e32 v1, s12, v1 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll index 535173bb0f9ed5..5077ddf894c318 100644 --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -13,17 +13,17 @@ define amdgpu_kernel void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind ; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_addk_i32 s0, 0x80 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_and_b64 s[0:1], exec, -1 +; GCN-NEXT: s_and_b64 vcc, exec, -1 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: .LBB0_2: ; %for.body ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b32 v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 
; GCN-NEXT: ds_write_b32 v0, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0 -; GCN-NEXT: s_mov_b64 vcc, s[0:1] +; GCN-NEXT: s_add_i32 s0, s0, 4 +; GCN-NEXT: s_mov_b64 vcc, vcc ; GCN-NEXT: s_cbranch_vccnz .LBB0_2 ; GCN-NEXT: .LBB0_3: ; %for.exit ; GCN-NEXT: s_endpgm @@ -92,15 +92,15 @@ define amdgpu_kernel void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nou ; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_addk_i32 s0, 0x80 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: .LBB1_1: ; %for.body ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b32 v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GCN-NEXT: ds_write_b32 v0, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN-NEXT: s_add_i32 s0, s0, 4 ; GCN-NEXT: s_branch .LBB1_1 ; ; GCN_DBG-LABEL: loop_const_true: @@ -291,23 +291,23 @@ define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n) nounwind ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u8 v0, v0 -; GCN-NEXT: s_load_dword s2, s[0:1], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: s_bitcmp1_b32 s0, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_addk_i32 s2, 0x80 -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_and_b64 s[0:1], exec, s[0:1] +; GCN-NEXT: s_xor_b64 s[2:3], s[0:1], -1 +; GCN-NEXT: s_add_i32 s0, s4, 0x80 +; GCN-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN-NEXT: .LBB4_1: ; %for.body ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b32 v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GCN-NEXT: ds_write_b32 v0, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0 -; GCN-NEXT: s_mov_b64 vcc, s[0:1] +; GCN-NEXT: s_add_i32 s0, s0, 4 +; GCN-NEXT: s_mov_b64 vcc, vcc ; GCN-NEXT: s_cbranch_vccz .LBB4_1 ; GCN-NEXT: ; %bb.2: ; %for.exit ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index 0297ddbcb04acb..4a7a328d33c614 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -975,7 +975,7 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir index 02966a2a5942f7..84b315b8003192 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,ADDR64 # RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64 # RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | 
FileCheck %s --check-prefixes=W64,W64-NO-ADDR64 @@ -12,58 +13,7 @@ # on non-ADDR64 hardware, we emit a waterfall loop. -# W64-LABEL: name: idxen -# W64-LABEL: bb.0: -# W64-NEXT: successors: %bb.1({{.*}}) -# W64: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 -# W64: [[SAVEEXEC:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec -# W64-LABEL: bb.1: -# W64-NEXT: successors: %bb.2({{.*}}) -# W64: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec -# W64: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec -# W64: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1 -# W64: [[CMP0:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[STMP0]], [[VRSRC]].sub0_sub1, implicit $exec -# W64: [[SRSRC2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub2, implicit $exec -# W64: [[SRSRC3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub3, implicit $exec -# W64: [[STMP1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC2]], %subreg.sub0, [[SRSRC3]], %subreg.sub1 -# W64: [[CMP1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[STMP1]], [[VRSRC]].sub2_sub3, implicit $exec -# W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc -# W64: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 -# W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W64-LABEL: bb.2: -# W64-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}}) -# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec -# W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc -# W64: SI_WATERFALL_LOOP %bb.1, implicit $exec -# W64-LABEL: bb.3: -# W64: $exec = S_MOV_B64 [[SAVEEXEC]] - -# W32-LABEL: name: idxen -# W32-LABEL: bb.0: -# W32-NEXT: successors: %bb.1({{.*}}) -# W32: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 -# W32: [[SAVEEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo -# W32-LABEL: bb.1: -# W32-NEXT: successors: %bb.2({{.*}}) -# W32: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec -# W32: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec -# W32: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1 -# W32: [[CMP0:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[STMP0]], [[VRSRC]].sub0_sub1, implicit $exec -# W32: [[SRSRC2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub2, implicit $exec -# W32: [[SRSRC3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub3, implicit $exec -# W32: [[STMP1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC2]], %subreg.sub0, [[SRSRC3]], %subreg.sub1 -# W32: [[CMP1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[STMP1]], [[VRSRC]].sub2_sub3, implicit $exec -# W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc -# W32: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 -# W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W32-LABEL: bb.2: -# W32-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}}) -# W32: {{[0-9]+}}:vgpr_32 = 
BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec # TODO: S_XOR_B32_term should be `implicit-def $scc` -# W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] -# W32: SI_WATERFALL_LOOP %bb.1, implicit $exec -# W32-LABEL: bb.3: -# W32: $exec_lo = S_MOV_B32 [[SAVEEXEC]] --- name: idxen liveins: @@ -76,6 +26,100 @@ liveins: body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 + ; W64-LABEL: name: idxen + ; W64: successors: %bb.1(0x80000000) + ; W64-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 + ; W64-NEXT: {{ $}} + ; W64-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; W64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; W64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; W64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; W64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; W64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; W64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec + ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec + ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; W64-NEXT: {{ $}} + ; W64-NEXT: .1: + ; W64-NEXT: successors: %bb.2(0x80000000) + ; W64-NEXT: {{ $}} + ; W64-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; W64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; W64-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; W64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; W64-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec + ; W64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc + ; W64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; W64-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; W64-NEXT: {{ $}} + ; W64-NEXT: .2: + ; W64-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; W64-NEXT: {{ $}} + ; W64-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec + ; W64-NEXT: 
$exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; W64-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W64-NEXT: {{ $}} + ; W64-NEXT: .3: + ; W64-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] + ; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] + ; W64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_IDXEN]] + ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; W32-LABEL: name: idxen + ; W32: successors: %bb.1(0x80000000) + ; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 + ; W32-NEXT: {{ $}} + ; W32-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; W32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; W32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; W32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; W32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec + ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; W32-NEXT: {{ $}} + ; W32-NEXT: .1: + ; W32-NEXT: successors: %bb.2(0x80000000) + ; W32-NEXT: {{ $}} + ; W32-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; W32-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; W32-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec + ; W32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc + ; W32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; W32-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; W32-NEXT: {{ $}} + ; W32-NEXT: .2: + ; W32-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; W32-NEXT: {{ $}} + ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec + ; W32-NEXT: $exec_lo = S_XOR_B32_term 
$exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; W32-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W32-NEXT: {{ $}} + ; W32-NEXT: .3: + ; W32-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] + ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] + ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_IDXEN]] + ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 %5:sreg_64 = COPY $sgpr30_sgpr31 %4:vgpr_32 = COPY $vgpr4 %3:vgpr_32 = COPY $vgpr3 @@ -89,58 +133,7 @@ body: | S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ... -# W64-LABEL: name: offen -# W64-LABEL: bb.0: -# W64-NEXT: successors: %bb.1({{.*}}) -# W64: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 -# W64: [[SAVEEXEC:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec -# W64-LABEL: bb.1: -# W64-NEXT: successors: %bb.2({{.*}}) -# W64: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec -# W64: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec -# W64: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1 -# W64: [[CMP0:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[STMP0]], [[VRSRC]].sub0_sub1, implicit $exec -# W64: [[SRSRC2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub2, implicit $exec -# W64: [[SRSRC3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub3, implicit $exec -# W64: [[STMP1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC2]], %subreg.sub0, [[SRSRC3]], %subreg.sub1 -# W64: [[CMP1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[STMP1]], [[VRSRC]].sub2_sub3, implicit $exec -# W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc -# W64: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 -# W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W64-LABEL: bb.2: -# W64-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}}) -# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec -# W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc -# W64: SI_WATERFALL_LOOP %bb.1, implicit $exec -# W64-LABEL: bb.3: -# W64: $exec = S_MOV_B64 [[SAVEEXEC]] -# W32-LABEL: name: offen -# W32-LABEL: bb.0: -# W32-NEXT: successors: %bb.1({{.*}}) -# W32: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 -# W32: [[SAVEEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo -# W32-LABEL: bb.1: -# W32-NEXT: successors: %bb.2({{.*}}) -# W32: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec -# W32: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec -# W32: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1 -# W32: [[CMP0:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[STMP0]], [[VRSRC]].sub0_sub1, implicit $exec -# W32: [[SRSRC2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub2, implicit $exec -# W32: [[SRSRC3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub3, implicit $exec -# W32: [[STMP1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC2]], %subreg.sub0, [[SRSRC3]], %subreg.sub1 -# W32: [[CMP1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[STMP1]], [[VRSRC]].sub2_sub3, implicit $exec -# W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc -# W32: 
[[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 -# W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W32-LABEL: bb.2: -# W32-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}}) -# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec -# TODO: S_XOR_B32_term should be `implicit-def $scc` -# W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] -# W32: SI_WATERFALL_LOOP %bb.1, implicit $exec -# W32-LABEL: bb.3: -# W32: $exec_lo = S_MOV_B32 [[SAVEEXEC]] --- name: offen liveins: @@ -153,6 +146,100 @@ liveins: body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 + ; W64-LABEL: name: offen + ; W64: successors: %bb.1(0x80000000) + ; W64-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 + ; W64-NEXT: {{ $}} + ; W64-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; W64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; W64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; W64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; W64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; W64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; W64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec + ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec + ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; W64-NEXT: {{ $}} + ; W64-NEXT: .1: + ; W64-NEXT: successors: %bb.2(0x80000000) + ; W64-NEXT: {{ $}} + ; W64-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; W64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; W64-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; W64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; W64-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec + ; W64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc + ; W64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; W64-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = 
S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; W64-NEXT: {{ $}} + ; W64-NEXT: .2: + ; W64-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; W64-NEXT: {{ $}} + ; W64-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec + ; W64-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; W64-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W64-NEXT: {{ $}} + ; W64-NEXT: .3: + ; W64-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] + ; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] + ; W64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] + ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; W32-LABEL: name: offen + ; W32: successors: %bb.1(0x80000000) + ; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 + ; W32-NEXT: {{ $}} + ; W32-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; W32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; W32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; W32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; W32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec + ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec + ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; W32-NEXT: {{ $}} + ; W32-NEXT: .1: + ; W32-NEXT: successors: %bb.2(0x80000000) + ; W32-NEXT: {{ $}} + ; W32-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; W32-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; W32-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec + ; W32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc + ; W32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; W32-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed 
[[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; W32-NEXT: {{ $}} + ; W32-NEXT: .2: + ; W32-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; W32-NEXT: {{ $}} + ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec + ; W32-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; W32-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W32-NEXT: {{ $}} + ; W32-NEXT: .3: + ; W32-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] + ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] + ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] + ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 %5:sreg_64 = COPY $sgpr30_sgpr31 %4:vgpr_32 = COPY $vgpr4 %3:vgpr_32 = COPY $vgpr3 @@ -166,58 +253,7 @@ body: | S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ... -# W64-LABEL: name: bothen -# W64-LABEL: bb.0: -# W64-NEXT: successors: %bb.1({{.*}}) -# W64: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 -# W64: [[SAVEEXEC:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec -# W64-LABEL: bb.1: -# W64-NEXT: successors: %bb.2({{.*}}) -# W64: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec -# W64: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec -# W64: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1 -# W64: [[CMP0:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[STMP0]], [[VRSRC]].sub0_sub1, implicit $exec -# W64: [[SRSRC2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub2, implicit $exec -# W64: [[SRSRC3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub3, implicit $exec -# W64: [[STMP1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC2]], %subreg.sub0, [[SRSRC3]], %subreg.sub1 -# W64: [[CMP1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[STMP1]], [[VRSRC]].sub2_sub3, implicit $exec -# W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc -# W64: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 -# W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W64-LABEL: bb.2: -# W64-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}}) -# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec -# W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc -# W64: SI_WATERFALL_LOOP %bb.1, implicit $exec -# W64-LABEL: bb.3: -# W64: $exec = S_MOV_B64 [[SAVEEXEC]] -# W32-LABEL: name: bothen -# W32-LABEL: bb.0: -# W32-NEXT: successors: %bb.1({{.*}}) -# W32: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 -# W32: [[SAVEEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo -# W32-LABEL: bb.1: -# W32-NEXT: successors: %bb.2({{.*}}) -# W32: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec -# W32: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec -# W32: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1 -# W32: [[CMP0:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[STMP0]], [[VRSRC]].sub0_sub1, implicit $exec -# W32: [[SRSRC2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub2, implicit $exec -# 
W32: [[SRSRC3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub3, implicit $exec -# W32: [[STMP1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC2]], %subreg.sub0, [[SRSRC3]], %subreg.sub1 -# W32: [[CMP1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[STMP1]], [[VRSRC]].sub2_sub3, implicit $exec -# W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc -# W32: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 -# W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W32-LABEL: bb.2: -# W32-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}}) -# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec -# TODO: S_XOR_B32_term should be `implicit-def $scc` -# W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] -# W32: SI_WATERFALL_LOOP %bb.1, implicit $exec -# W32-LABEL: bb.3: -# W32: $exec_lo = S_MOV_B32 [[SAVEEXEC]] --- name: bothen liveins: @@ -230,6 +266,100 @@ liveins: body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 + ; W64-LABEL: name: bothen + ; W64: successors: %bb.1(0x80000000) + ; W64-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 + ; W64-NEXT: {{ $}} + ; W64-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; W64-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 + ; W64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; W64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; W64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; W64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; W64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec + ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec + ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; W64-NEXT: {{ $}} + ; W64-NEXT: .1: + ; W64-NEXT: successors: %bb.2(0x80000000) + ; W64-NEXT: {{ $}} + ; W64-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; W64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; W64-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; W64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; W64-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec + ; W64-NEXT: 
[[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc + ; W64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; W64-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; W64-NEXT: {{ $}} + ; W64-NEXT: .2: + ; W64-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; W64-NEXT: {{ $}} + ; W64-NEXT: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec + ; W64-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; W64-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W64-NEXT: {{ $}} + ; W64-NEXT: .3: + ; W64-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] + ; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] + ; W64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]] + ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; W32-LABEL: name: bothen + ; W32: successors: %bb.1(0x80000000) + ; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 + ; W32-NEXT: {{ $}} + ; W32-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; W32-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 + ; W32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; W32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; W32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec + ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; W32-NEXT: {{ $}} + ; W32-NEXT: .1: + ; W32-NEXT: successors: %bb.2(0x80000000) + ; W32-NEXT: {{ $}} + ; W32-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; W32-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; W32-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec + ; W32-NEXT: 
[[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc + ; W32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; W32-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; W32-NEXT: {{ $}} + ; W32-NEXT: .2: + ; W32-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; W32-NEXT: {{ $}} + ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec + ; W32-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; W32-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W32-NEXT: {{ $}} + ; W32-NEXT: .3: + ; W32-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] + ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] + ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]] + ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 %5:sreg_64 = COPY $sgpr30_sgpr31 %4:vreg_64 = COPY $vgpr4_vgpr5 %3:vgpr_32 = COPY $vgpr3 @@ -243,17 +373,7 @@ body: | S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ... -# ADDR64-LABEL: name: addr64 -# ADDR64-LABEL: bb.0: -# ADDR64: %14:vreg_64 = COPY %8.sub0_sub1 -# ADDR64: %15:sreg_64 = S_MOV_B64 0 -# ADDR64: %16:sgpr_32 = S_MOV_B32 0 -# ADDR64: %17:sgpr_32 = S_MOV_B32 61440 -# ADDR64: %18:sgpr_128 = REG_SEQUENCE %15, %subreg.sub0_sub1, %16, %subreg.sub2, %17, %subreg.sub3 -# ADDR64: %9:vgpr_32, %12:sreg_64_xexec = V_ADD_CO_U32_e64 %14.sub0, %4.sub0, 0, implicit $exec -# ADDR64: %10:vgpr_32, dead %13:sreg_64_xexec = V_ADDC_U32_e64 %14.sub1, %4.sub1, killed %12, 0, implicit $exec -# ADDR64: %11:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %10, %subreg.sub1 -# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %11, killed %18, 0, 0, 0, 0, 0, implicit $exec + --- name: addr64 liveins: @@ -266,6 +386,64 @@ liveins: body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 + ; ADDR64-LABEL: name: addr64 + ; ADDR64: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 + ; ADDR64-NEXT: {{ $}} + ; ADDR64-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; ADDR64-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 + ; ADDR64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; ADDR64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; ADDR64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; ADDR64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; ADDR64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; ADDR64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec + ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec + ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; ADDR64-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; ADDR64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; ADDR64-NEXT: 
[[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 61440 + ; ADDR64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3 + ; ADDR64-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY9]].sub0, [[COPY1]].sub0, 0, implicit $exec + ; ADDR64-NEXT: %17:vgpr_32, dead %20:sreg_64_xexec = V_ADDC_U32_e64 [[COPY9]].sub1, [[COPY1]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %17, %subreg.sub1 + ; ADDR64-NEXT: [[BUFFER_LOAD_FORMAT_X_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[REG_SEQUENCE2]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec + ; ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] + ; ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]] + ; ADDR64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; W32-LABEL: name: addr64 + ; W32: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 + ; W32-NEXT: {{ $}} + ; W32-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; W32-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 + ; W32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; W32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; W32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec + ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W32-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; W32-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; W32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 822173696 + ; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3 + ; W32-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY9]].sub0, [[COPY1]].sub0, 0, implicit $exec + ; W32-NEXT: %17:vgpr_32, dead %20:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY9]].sub1, [[COPY1]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %17, %subreg.sub1 + ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[REG_SEQUENCE2]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec + ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] + ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]] + ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 %5:sreg_64 = COPY $sgpr30_sgpr31 %4:vreg_64 = COPY $vgpr4_vgpr5 %3:vgpr_32 = COPY $vgpr3 @@ -279,66 +457,7 @@ body: | S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ... 
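# All of the waterfall-loop checks in this file follow one pattern: the
# legalization of a divergent (VGPR) buffer resource descriptor for MUBUF
# instructions; only the BUFFER_LOAD_FORMAT_X_{IDXEN,OFFEN,BOTHEN,OFFSET}
# opcode and the wave size change between tests. A minimal sketch of the
# wave64 form, assuming the usual SIInstrInfo lowering (wave32 swaps in the
# *_B32 opcodes and $exec_lo); the register names below are illustrative,
# not taken from the checks:
#
#   %save:sreg_64_xexec = S_MOV_B64 $exec             ; save full exec mask
# loop:
#   ; read one lane's descriptor into SGPRs, subreg by subreg
#   %s0..%s3 = V_READFIRSTLANE_B32 %vrsrc.sub0..sub3
#   %cmp = S_AND_B64 (V_CMP_EQ_U64 %s0_s1, %vrsrc.sub0_sub1),
#                    (V_CMP_EQ_U64 %s2_s3, %vrsrc.sub2_sub3)
#   %tmpexec = S_AND_SAVEEXEC_B64 killed %cmp         ; run matching lanes only
#   ... BUFFER_LOAD_* using the now-uniform SGPR descriptor ...
#   $exec = S_XOR_B64_term $exec, %tmpexec            ; retire finished lanes
#   SI_WATERFALL_LOOP loop, implicit $exec            ; loop while lanes remain
# done:
#   $exec = S_MOV_B64 %save                           ; restore exec
#
# Targets with ADDR64 addressing (the ADDR64 checks above) avoid the loop
# entirely: the pointer half of the descriptor moves into the VADDR64 operand
# and the rsrc is rebuilt from constants, so no per-lane serialization occurs.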
-# W64-LABEL: name: offset -# W64-LABEL: bb.0: - -# W64-NO-ADDR64: successors: %bb.1({{.*}}) -# W64-NO-ADDR64: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 -# W64-NO-ADDR64: [[SAVEEXEC:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec -# W64-NO-ADDR64-LABEL: bb.1: -# W64-NO-ADDR64-NEXT: successors: %bb.2({{.*}}) -# W64-NO-ADDR64: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec -# W64-NO-ADDR64: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec -# W64-NO-ADDR64: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1 -# W64-NO-ADDR64: [[CMP0:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[STMP0]], [[VRSRC]].sub0_sub1, implicit $exec -# W64-NO-ADDR64: [[SRSRC2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub2, implicit $exec -# W64-NO-ADDR64: [[SRSRC3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub3, implicit $exec -# W64-NO-ADDR64: [[STMP1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC2]], %subreg.sub0, [[SRSRC3]], %subreg.sub1 -# W64-NO-ADDR64: [[CMP1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[STMP1]], [[VRSRC]].sub2_sub3, implicit $exec -# W64-NO-ADDR64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc -# W64-NO-ADDR64: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 -# W64-NO-ADDR64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W64-NO-ADDR64-LABEL: bb.2: -# W64-NO-ADDR64-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}}) -# W64-NO-ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec -# W64-NO-ADDR64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc -# W64-NO-ADDR64: SI_WATERFALL_LOOP %bb.1, implicit $exec -# W64-NO-ADDR64-LABEL: bb.3: -# W64-NO-ADDR64: $exec = S_MOV_B64 [[SAVEEXEC]] - -# W32: successors: %bb.1({{.*}}) -# W32: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 -# W32: [[SAVEEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo -# W32-LABEL: bb.1: -# W32-NEXT: successors: %bb.2({{.*}}) -# W32: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec -# W32: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec -# W32: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1 -# W32: [[CMP0:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[STMP0]], [[VRSRC]].sub0_sub1, implicit $exec -# W32: [[SRSRC2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub2, implicit $exec -# W32: [[SRSRC3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub3, implicit $exec -# W32: [[STMP1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC2]], %subreg.sub0, [[SRSRC3]], %subreg.sub1 -# W32: [[CMP1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[STMP1]], [[VRSRC]].sub2_sub3, implicit $exec -# W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc -# W32: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 -# W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W32-LABEL: bb.2: -# W32-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}}) 
-# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec -# TODO: S_XOR_B32_term should be `implicit-def $scc` -# W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] -# W32: SI_WATERFALL_LOOP %bb.1, implicit $exec -# W32-LABEL: bb.3: -# W32: $exec_lo = S_MOV_B32 [[SAVEEXEC]] -# ADDR64: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 -# ADDR64: [[RSRCPTR:%[0-9]+]]:vreg_64 = COPY [[VRSRC]].sub0_sub1 -# ADDR64: [[ZERO64:%[0-9]+]]:sreg_64 = S_MOV_B64 0 -# ADDR64: [[RSRCFMTLO:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 -# ADDR64: [[RSRCFMTHI:%[0-9]+]]:sgpr_32 = S_MOV_B32 61440 -# ADDR64: [[ZERORSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[ZERO64]], %subreg.sub0_sub1, [[RSRCFMTLO]], %subreg.sub2, [[RSRCFMTHI]], %subreg.sub3 -# ADDR64: [[VADDR64:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[RSRCPTR]].sub0, %subreg.sub0, [[RSRCPTR]].sub1, %subreg.sub1 -# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[VADDR64]], [[ZERORSRC]], 0, 0, 0, 0, 0, implicit $exec --- name: offset @@ -352,6 +471,127 @@ liveins: body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 + ; ADDR64-LABEL: name: offset + ; ADDR64: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 + ; ADDR64-NEXT: {{ $}} + ; ADDR64-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; ADDR64-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 + ; ADDR64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; ADDR64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; ADDR64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; ADDR64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; ADDR64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; ADDR64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec + ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec + ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; ADDR64-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; ADDR64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; ADDR64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 61440 + ; ADDR64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3 + ; ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]].sub0, %subreg.sub0, [[COPY9]].sub1, %subreg.sub1 + ; ADDR64-NEXT: [[BUFFER_LOAD_FORMAT_X_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec + ; ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] + ; ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]] + ; ADDR64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; W64-NO-ADDR64-LABEL: name: offset + ; W64-NO-ADDR64: successors: %bb.1(0x80000000) + ; W64-NO-ADDR64-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 + ; W64-NO-ADDR64-NEXT: {{ $}} + ; W64-NO-ADDR64-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; W64-NO-ADDR64-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY 
$vgpr4_vgpr5 + ; W64-NO-ADDR64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; W64-NO-ADDR64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; W64-NO-ADDR64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; W64-NO-ADDR64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; W64-NO-ADDR64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W64-NO-ADDR64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W64-NO-ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W64-NO-ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W64-NO-ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec + ; W64-NO-ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; W64-NO-ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec + ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W64-NO-ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; W64-NO-ADDR64-NEXT: {{ $}} + ; W64-NO-ADDR64-NEXT: .1: + ; W64-NO-ADDR64-NEXT: successors: %bb.2(0x80000000) + ; W64-NO-ADDR64-NEXT: {{ $}} + ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; W64-NO-ADDR64-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec + ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; W64-NO-ADDR64-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec + ; W64-NO-ADDR64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc + ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; W64-NO-ADDR64-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; W64-NO-ADDR64-NEXT: {{ $}} + ; W64-NO-ADDR64-NEXT: .2: + ; W64-NO-ADDR64-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; W64-NO-ADDR64-NEXT: {{ $}} + ; W64-NO-ADDR64-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec + ; W64-NO-ADDR64-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; W64-NO-ADDR64-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W64-NO-ADDR64-NEXT: {{ $}} + ; W64-NO-ADDR64-NEXT: .3: + ; W64-NO-ADDR64-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] + ; W64-NO-ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] + ; W64-NO-ADDR64-NEXT: $vgpr0 = COPY 
[[BUFFER_LOAD_FORMAT_X_OFFSET]] + ; W64-NO-ADDR64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; W32-LABEL: name: offset + ; W32: successors: %bb.1(0x80000000) + ; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 + ; W32-NEXT: {{ $}} + ; W32-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; W32-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 + ; W32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; W32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; W32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec + ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; W32-NEXT: {{ $}} + ; W32-NEXT: .1: + ; W32-NEXT: successors: %bb.2(0x80000000) + ; W32-NEXT: {{ $}} + ; W32-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; W32-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; W32-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec + ; W32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc + ; W32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; W32-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; W32-NEXT: {{ $}} + ; W32-NEXT: .2: + ; W32-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; W32-NEXT: {{ $}} + ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec + ; W32-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; W32-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W32-NEXT: {{ $}} + ; W32-NEXT: .3: + ; W32-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] + ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] + ; W32-NEXT: $vgpr0 = COPY 
[[BUFFER_LOAD_FORMAT_X_OFFSET]] + ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 %5:sreg_64 = COPY $sgpr30_sgpr31 %4:vreg_64 = COPY $vgpr4_vgpr5 %3:vgpr_32 = COPY $vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir index 611c5bb3271ba9..0045fa68c4f098 100644 --- a/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir @@ -1,25 +1,8 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies,si-fold-operands,dead-mi-elimination -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s # Check that constant is in SGPR registers -# GCN-LABEL: {{^}}name: const_to_sgpr{{$}} -# GCN: %[[HI:[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0 -# GCN-NEXT: %[[LO:[0-9]+]]:sreg_32_xm0 = S_MOV_B32 1048576 -# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]]:sreg_64 = REG_SEQUENCE killed %[[LO]], %subreg.sub0, killed %[[HI]], %subreg.sub1 -# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit $exec - - -# GCN-LABEL: {{^}}name: const_to_sgpr_multiple_use{{$}} -# GCN: %[[HI:[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0 -# GCN-NEXT: %[[LO:[0-9]+]]:sreg_32_xm0 = S_MOV_B32 1048576 -# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]]:sreg_64 = REG_SEQUENCE killed %[[LO]], %subreg.sub0, killed %[[HI]], %subreg.sub1 -# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit $exec -# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit $exec - -# GCN-LABEL: {{^}}name: const_to_sgpr_subreg{{$}} -# GCN: %[[OP0:[0-9]+]]:vreg_64 = REG_SEQUENCE killed %{{[0-9]+}}, %subreg.sub0, killed %{{[0-9]+}}, %subreg.sub1 -# GCN-NEXT: V_CMP_LT_U32_e64 killed %[[OP0]].sub0, 12, implicit $exec - --- | define amdgpu_kernel void @const_to_sgpr(i32 addrspace(1)* nocapture %arg, i64 %id) { bb: @@ -99,6 +82,44 @@ liveins: - { reg: '$vgpr0', virtual-reg: '%2' } - { reg: '$sgpr0_sgpr1', virtual-reg: '%3' } body: | + ; GCN-LABEL: name: const_to_sgpr + ; GCN: bb.0.bb: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 11, 0 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1 + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_LOAD_DWORDX2_IMM1]].sub0, [[REG_SEQUENCE]].sub0, implicit-def $vcc, implicit $exec + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 + ; GCN-NEXT: [[V_ADDC_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADDC_U32_e32 0, [[COPY3]], implicit-def $vcc, implicit $vcc, implicit $exec + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_CO_U32_e32_]], implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[V_ADDC_U32_e32_]], %subreg.sub1 + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0 + ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 1048576 + ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = 
REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 + ; GCN-NEXT: [[V_CMP_LT_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U64_e64 killed [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], implicit $exec + ; GCN-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_U64_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1.bb1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[V_LSHL_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHL_B64_e64 [[REG_SEQUENCE]], 2, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 61440 + ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0 + ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_3]], %subreg.sub0, killed [[S_MOV_B32_2]], %subreg.sub1 + ; GCN-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub1_sub2_sub3_sub4_sub5, killed [[REG_SEQUENCE3]], %subreg.sub1_sub2_sub3_sub4_sub5_sub6 + ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 killed [[V_MOV_B32_e32_1]], [[V_LSHL_B64_e64_]], killed [[REG_SEQUENCE4]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2.bb2: + ; GCN-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 bb.0.bb: successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000) liveins: $vgpr0, $sgpr0_sgpr1 @@ -197,6 +218,50 @@ liveins: - { reg: '$vgpr0', virtual-reg: '%2' } - { reg: '$sgpr0_sgpr1', virtual-reg: '%3' } body: | + ; GCN-LABEL: name: const_to_sgpr_multiple_use + ; GCN: bb.0.bb: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 11, 0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 13, 0 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 + ; GCN-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32_xm0 = S_ADD_U32 [[REG_SEQUENCE]].sub0, [[S_LOAD_DWORDX2_IMM1]].sub0, implicit-def $scc + ; GCN-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32_xm0 = S_ADDC_U32 0, [[S_LOAD_DWORDX2_IMM1]].sub1, implicit-def dead $scc, implicit $scc + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_ADD_U32_]], %subreg.sub0, killed [[S_ADDC_U32_]], %subreg.sub1 + ; GCN-NEXT: [[S_ADD_U32_1:%[0-9]+]]:sreg_32_xm0 = S_ADD_U32 [[REG_SEQUENCE]].sub0, [[S_LOAD_DWORDX2_IMM2]].sub0, implicit-def $scc + ; GCN-NEXT: [[S_ADDC_U32_1:%[0-9]+]]:sreg_32_xm0 = S_ADDC_U32 0, [[S_LOAD_DWORDX2_IMM2]].sub1, implicit-def dead $scc, implicit $scc + ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_ADD_U32_1]], %subreg.sub0, killed [[S_ADDC_U32_1]], %subreg.sub1 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1048576, implicit $exec + ; GCN-NEXT: 
[[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_MOV_B32_e32_]], %subreg.sub0, killed [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GCN-NEXT: [[V_CMP_LT_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U64_e64 killed [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], implicit $exec + ; GCN-NEXT: [[V_CMP_LT_U64_e64_1:%[0-9]+]]:sreg_64 = V_CMP_LT_U64_e64 killed [[REG_SEQUENCE2]], [[REG_SEQUENCE3]], implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 killed [[V_CMP_LT_U64_e64_]], killed [[V_CMP_LT_U64_e64_1]], implicit-def dead $scc + ; GCN-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[S_AND_B64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1.bb1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 [[REG_SEQUENCE]], 2, implicit-def dead $scc + ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 61440 + ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0 + ; GCN-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_2]], %subreg.sub0, killed [[S_MOV_B32_1]], %subreg.sub1 + ; GCN-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub1_sub2_sub3_sub4_sub5, killed [[REG_SEQUENCE4]], %subreg.sub1_sub2_sub3_sub4_sub5_sub6 + ; GCN-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_LSHL_B64_]] + ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 killed [[V_MOV_B32_e32_2]], killed [[COPY3]], killed [[REG_SEQUENCE5]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2.bb2: + ; GCN-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 bb.0.bb: successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000) liveins: $vgpr0, $sgpr0_sgpr1 @@ -294,6 +359,41 @@ liveins: - { reg: '$vgpr0', virtual-reg: '%2' } - { reg: '$sgpr0_sgpr1', virtual-reg: '%3' } body: | + ; GCN-LABEL: name: const_to_sgpr_subreg + ; GCN: bb.0.bb: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 11, 0 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1 + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_LOAD_DWORDX2_IMM1]].sub0, [[REG_SEQUENCE]].sub0, implicit-def $vcc, implicit $exec + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 + ; GCN-NEXT: [[V_ADDC_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADDC_U32_e32 0, [[COPY3]], implicit-def $vcc, implicit $vcc, implicit $exec + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_CO_U32_e32_]], implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[V_ADDC_U32_e32_]], %subreg.sub1 + ; GCN-NEXT: [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_64 = 
V_CMP_LT_U32_e64 killed [[REG_SEQUENCE1]].sub0, 12, implicit $exec + ; GCN-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1.bb1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[V_LSHL_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHL_B64_e64 [[REG_SEQUENCE]], 2, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 61440 + ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0 + ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 + ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub1_sub2_sub3_sub4_sub5, killed [[REG_SEQUENCE2]], %subreg.sub1_sub2_sub3_sub4_sub5_sub6 + ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 killed [[V_MOV_B32_e32_1]], [[V_LSHL_B64_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2.bb2: + ; GCN-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 bb.0.bb: successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000) liveins: $vgpr0, $sgpr0_sgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 4ff74b63ebce79..afd429dd079e8f 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -97,9 +97,9 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 ; GCN-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, s10, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 ; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc @@ -494,7 +494,7 @@ define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -524,7 +524,7 @@ define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -628,7 +628,7 @@ define amdgpu_kernel void @s_test_sdiv32_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm @@ -664,7 +664,7 @@ define amdgpu_kernel 
void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 31 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -694,7 +694,7 @@ define amdgpu_kernel void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 31 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -731,7 +731,7 @@ define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -761,7 +761,7 @@ define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -828,7 +828,7 @@ define amdgpu_kernel void @s_test_sdiv25_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -864,7 +864,7 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(<2 x i64> addrspace(1)* %out, <2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: s_ashr_i64 s[10:11], s[10:11], 40 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, s10 ; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, s6 @@ -926,7 +926,7 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(<2 x i64> addrspace(1)* %out, <2 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; GCN-IR-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-IR-NEXT: v_bfe_i32 v2, v2, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v3, 31, v2 @@ -1161,7 +1161,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc ; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 ; GCN-NEXT: v_mul_hi_u32 v2, s2, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-NEXT: v_mul_lo_u32 v2, s2, v0 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, 
24, v2 @@ -1854,7 +1854,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(i64 addrspace(1)* %out, i64 % ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s8 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1880,7 +1880,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(i64 addrspace(1)* %out, i64 % ; GCN-IR-NEXT: v_mov_b32_e32 v2, s0 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s8 ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll index d57bd390ab999b..62754db64e0e78 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll @@ -3,11 +3,11 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 { ; GCN-LABEL: name: test_spill_av_class - ; GCN: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %21.sub0 + ; GCN: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %22.sub0 ; GCN-NEXT: undef [[AV_REG:%[0-9]+]].sub0:av_64 = COPY %{{[0-9]+}}.sub0 ; GCN-NEXT: SI_SPILL_AV64_SAVE [[AV_REG]], %stack.0, $sgpr32, 0, implicit $exec ; GCN: [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:av_64 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec - ; GCN-NEXT: undef %22.sub0:vreg_64 = COPY [[SI_SPILL_AV64_RESTORE]].sub0 + ; GCN-NEXT: undef %23.sub0:vreg_64 = COPY [[SI_SPILL_AV64_RESTORE]].sub0 %v0 = call i32 asm sideeffect "; def $0", "=v"() %tmp = insertelement <2 x i32> undef, i32 %v0, i32 0 %mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %arg, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll index 8398cf0fe89861..e9472a865fd9ab 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -375,8 +375,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc ; CHECK-NEXT: [[V_OR_B32_e32_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[S_ADD_I32_24]], [[V_OR_B32_e32_66]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e32_67]], implicit $exec - ; CHECK-NEXT: undef %692.sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec - ; CHECK-NEXT: IMAGE_STORE_V4_V2_gfx10 %692, undef %578:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into custom "ImageResource") + ; CHECK-NEXT: undef %693.sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec + ; CHECK-NEXT: IMAGE_STORE_V4_V2_gfx10 %693, undef %578:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into custom "ImageResource") ; CHECK-NEXT: S_ENDPGM 0 .expVert: %0 = extractelement <31 x i32> %userData, 
i64 2 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 1cfd336196c445..1c623549ffeeda 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -49,7 +49,7 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -86,8 +86,8 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 ; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 ; GCN-NEXT: v_mul_lo_u32 v0, s12, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 ; GCN-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 @@ -202,8 +202,8 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: v_mul_lo_u32 v3, s5, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v0 ; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: s_mov_b32 s10, -1 @@ -473,7 +473,7 @@ define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 @@ -505,7 +505,7 @@ define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 @@ -544,7 +544,7 @@ define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 @@ -669,7 +669,7 @@ define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 @@ -701,7 +701,7 @@ define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; 
GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 @@ -740,7 +740,7 @@ define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 @@ -772,7 +772,7 @@ define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 @@ -810,7 +810,7 @@ define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s3, v0 @@ -910,7 +910,7 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -950,8 +950,8 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 ; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 ; GCN-NEXT: v_mul_lo_u32 v0, s12, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, s15, v1 ; GCN-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s14, v0 @@ -1078,15 +1078,13 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 ; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[14:15] ; GCN-IR-NEXT: .LBB8_6: ; %udiv-end -; GCN-IR-NEXT: v_mul_hi_u32 v2, s8, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s8, v1 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: v_readfirstlane_b32 s10, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v2, s9, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s8, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v3, s9, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v0, s8, v0 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, s10, v1 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc @@ -1094,6 +1092,7 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-IR-NEXT: v_xor_b32_e32 v1, s1, v1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 ; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: 
v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm @@ -1237,8 +1236,8 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN-IR-NEXT: v_mul_lo_u32 v3, s7, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v0, s6, v0 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc @@ -1337,7 +1336,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_mul_lo_u32 v1, s9, v0 ; GCN-NEXT: v_mul_hi_u32 v2, s8, v0 ; GCN-NEXT: v_mul_lo_u32 v0, s8, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc @@ -1444,8 +1443,8 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-IR-NEXT: v_mul_lo_u32 v3, s5, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v0 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc @@ -1994,7 +1993,7 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(i64 addrspace(1)* %out, i64 % ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 6161fed685980c..0c9bf2ac2f7687 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -671,7 +671,7 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad ; SI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3] ; SI-NEXT: v_subrev_i32_e32 v13, vcc, v1, v5 ; SI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5] -; SI-NEXT: v_sub_i32_e32 v14, vcc, v6, v2 +; SI-NEXT: v_subrev_i32_e32 v14, vcc, v2, v6 ; SI-NEXT: v_cndmask_b32_e64 v11, v11, v19, s[6:7] ; SI-NEXT: v_sub_i32_e32 v15, vcc, v7, v3 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] @@ -1851,14 +1851,14 @@ define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* ; SI-NEXT: v_cvt_u32_f32_e32 v1, v1 ; SI-NEXT: v_mul_lo_u32 v4, v4, v1 ; SI-NEXT: v_mul_hi_u32 v4, v1, v4 -; SI-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; SI-NEXT: v_mul_hi_u32 v1, v2, v1 ; SI-NEXT: v_mul_lo_u32 v3, v1, v0 ; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 +; SI-NEXT: v_subrev_i32_e32 v2, vcc, v3, v2 ; SI-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v0 ; SI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] -; SI-NEXT: v_sub_i32_e32 v3, vcc, v2, v0 +; SI-NEXT: v_subrev_i32_e32 v3, vcc, v0, v2 ; SI-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v1 ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 @@ -2352,7 
+2352,7 @@ define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readon ; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 ; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -2500,8 +2500,8 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; SI-NEXT: v_mul_hi_u32 v4, v2, s4 ; SI-NEXT: v_mul_lo_u32 v6, v3, s4 ; SI-NEXT: v_mul_lo_u32 v5, v2, s4 -; SI-NEXT: v_subrev_i32_e32 v4, vcc, v2, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; SI-NEXT: v_mul_hi_u32 v7, v2, v5 ; SI-NEXT: v_mul_lo_u32 v6, v2, v4 ; SI-NEXT: v_mul_hi_u32 v8, v2, v4 @@ -2598,8 +2598,8 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; VI-NEXT: v_cvt_u32_f32_e32 v7, v3 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 ; VI-NEXT: v_mul_lo_u32 v4, v7, s6 -; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 -; VI-NEXT: v_add_u32_e32 v8, vcc, v4, v3 +; VI-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v8, vcc, v3, v4 ; VI-NEXT: v_mul_hi_u32 v5, v6, v2 ; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0 ; VI-NEXT: v_add_u32_e32 v9, vcc, v5, v3 @@ -2685,8 +2685,8 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; GCN-NEXT: v_cvt_u32_f32_e32 v7, v3 ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 ; GCN-NEXT: v_mul_lo_u32 v4, v7, s6 -; GCN-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_add_u32_e32 v8, vcc, v4, v3 +; GCN-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_add_u32_e32 v8, vcc, v3, v4 ; GCN-NEXT: v_mul_hi_u32 v5, v6, v2 ; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0 ; GCN-NEXT: v_add_u32_e32 v9, vcc, v5, v3 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index a59b5d011acf4d..861843867735cf 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -86,9 +86,9 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 % ; GCN-NEXT: v_mul_hi_u32 v3, s8, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s9, v0 ; GCN-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, s8, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 ; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc @@ -723,9 +723,9 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN-NEXT: v_mul_lo_u32 v3, s2, v2 ; GCN-NEXT: v_mul_hi_u32 v4, s2, v1 ; GCN-NEXT: v_mul_lo_u32 v5, s3, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s2, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GCN-NEXT: v_mul_lo_u32 v7, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v8, v1, v4 ; GCN-NEXT: v_mul_hi_u32 v9, v1, v3 @@ -953,7 +953,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc ; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 ; GCN-NEXT: v_mul_hi_u32 v2, s2, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-NEXT: v_mul_lo_u32 v2, s2, v0 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, 24, v2 @@ -1374,7 +1374,7 @@ define 
amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_mul_lo_u32 v4, v0, s4 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -1547,7 +1547,7 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { ; GCN-NEXT: v_mul_hi_u32 v4, v2, s4 ; GCN-NEXT: v_mul_lo_u32 v5, v3, s4 ; GCN-NEXT: v_mul_lo_u32 v6, v2, s4 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, v4, v2 +; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v2, v4 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_mul_lo_u32 v5, v2, v4 ; GCN-NEXT: v_mul_hi_u32 v7, v2, v6 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 3d21601f8d20a5..e0a6fd977c1a1c 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -49,7 +49,7 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 % ; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -86,8 +86,8 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 % ; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 ; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 ; GCN-NEXT: v_mul_lo_u32 v0, s12, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 ; GCN-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 @@ -202,8 +202,8 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 % ; GCN-IR-NEXT: v_mul_lo_u32 v3, s5, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v0 ; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: s_mov_b32 s10, -1 @@ -941,7 +941,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_mul_lo_u32 v3, v1, s4 ; GCN-NEXT: v_mul_lo_u32 v4, v0, s4 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index eac4fd0563cac2..39c717ae9a19d1 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -234,10 +234,10 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* % ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %43:vgpr_32, %bb.0, %4, %bb.9 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %45:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %47:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI 
[[COPY2]], %bb.0, undef %49:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %49:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %51:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %53:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %55:vgpr_32, %bb.9 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} @@ -250,8 +250,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* % ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %51:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 - ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %53:vgpr_32, %bb.4, [[PHI1]], %bb.2 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %57:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %59:vgpr_32, %bb.4, [[PHI1]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -287,8 +287,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* % ; SI-NEXT: bb.7: ; SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %55:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 - ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %57:vgpr_32, %bb.8, [[COPY4]], %bb.6 + ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %61:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 + ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %63:vgpr_32, %bb.8, [[COPY4]], %bb.6 ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 @@ -357,9 +357,9 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float( ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %44:vgpr_32, %bb.0, %4, %bb.9 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %46:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %48:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %50:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %52:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %54:vgpr_32, %bb.9 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} @@ -372,7 +372,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float( ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %50:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: 
[[PHI3:%[0-9]+]]:vreg_64 = PHI undef %56:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -408,7 +408,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float( ; SI-NEXT: bb.7: ; SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %52:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %58:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 @@ -475,9 +475,9 @@ define amdgpu_kernel void @livevariables_update_missed_block(i8 addrspace(1)* %s ; SI-NEXT: successors: %bb.7(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset.cast, align 4, addrspace 4) - ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %50, 0, implicit $exec - ; SI-NEXT: %43:vgpr_32, dead %45:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %43, %subreg.sub1 + ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %51, 0, implicit $exec + ; SI-NEXT: %44:vgpr_32, dead %46:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %44, %subreg.sub1 ; SI-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1) ; SI-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B]], killed [[GLOBAL_LOAD_UBYTE]], 0, 0, implicit $exec :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1) @@ -503,14 +503,14 @@ define amdgpu_kernel void @livevariables_update_missed_block(i8 addrspace(1)* %s ; SI-NEXT: bb.5.Flow: ; SI-NEXT: successors: %bb.1(0x40000000), %bb.7(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %51:vgpr_32, %bb.6 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %52:vgpr_32, %bb.6 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.1 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.6.sw.bb18: ; SI-NEXT: successors: %bb.5(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = 
PHI undef %37:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %36:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 ; SI-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1) ; SI-NEXT: S_BRANCH %bb.5 @@ -576,8 +576,8 @@ define protected amdgpu_kernel void @nested_waterfalls(%tex* addrspace(1)* %tex. ; SI-NEXT: {{ $}} ; SI-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64_e64 3, killed [[REG_SEQUENCE]], implicit $exec ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[V_LSHLREV_B64_e64_]].sub0, 0, implicit $exec - ; SI-NEXT: %69:vgpr_32, dead %71:sreg_32_xm0_xexec = V_ADDC_U32_e64 killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_LSHLREV_B64_e64_]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %69, %subreg.sub1 + ; SI-NEXT: %85:vgpr_32, dead %87:sreg_32_xm0_xexec = V_ADDC_U32_e64 killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_LSHLREV_B64_e64_]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %85, %subreg.sub1 ; SI-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s64) from %ir.idx, addrspace 1) ; SI-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[GLOBAL_LOAD_DWORDX2_]], 16, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.6 + 16, addrspace 4) ; SI-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub3 @@ -596,17 +596,27 @@ define protected amdgpu_kernel void @nested_waterfalls(%tex* addrspace(1)* %tex. 
; SI-NEXT: bb.2: ; SI-NEXT: successors: %bb.3(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub0, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub1, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; SI-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE3]], [[GLOBAL_LOAD_DWORDX4_2]].sub0_sub1, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub2, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub3, implicit $exec + ; SI-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; SI-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; SI-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE4]], [[GLOBAL_LOAD_DWORDX4_2]].sub2_sub3, implicit $exec + ; SI-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; SI-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[V_CMP_EQ_U64_e64_]], killed [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; SI-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1, killed [[V_READFIRSTLANE_B32_2]], %subreg.sub2, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; SI-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub4, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub5, implicit $exec + ; SI-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_4]], %subreg.sub0, [[V_READFIRSTLANE_B32_5]], %subreg.sub1 + ; SI-NEXT: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub4_sub5, implicit $exec + ; SI-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[S_AND_B32_]], killed [[V_CMP_EQ_U64_e64_2]], implicit-def dead $scc + ; SI-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub6, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub7, implicit $exec + ; SI-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_6]], %subreg.sub0, [[V_READFIRSTLANE_B32_7]], %subreg.sub1 + ; 
SI-NEXT: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE6]], [[REG_SEQUENCE2]].sub6_sub7, implicit $exec + ; SI-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[S_AND_B32_1]], killed [[V_CMP_EQ_U64_e64_3]], implicit-def dead $scc + ; SI-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:sgpr_256 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1, killed [[V_READFIRSTLANE_B32_2]], %subreg.sub2, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub3, killed [[V_READFIRSTLANE_B32_4]], %subreg.sub4, killed [[V_READFIRSTLANE_B32_5]], %subreg.sub5, killed [[V_READFIRSTLANE_B32_6]], %subreg.sub6, killed [[V_READFIRSTLANE_B32_7]], %subreg.sub7 + ; SI-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_2]], implicit-def $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: {{ $}} ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) @@ -616,32 +626,22 @@ define protected amdgpu_kernel void @nested_waterfalls(%tex* addrspace(1)* %tex. ; SI-NEXT: bb.4: ; SI-NEXT: successors: %bb.5(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec - ; SI-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_4]], %subreg.sub0, [[V_READFIRSTLANE_B32_5]], %subreg.sub1 - ; SI-NEXT: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE6]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec - ; SI-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_6]], %subreg.sub0, [[V_READFIRSTLANE_B32_7]], %subreg.sub1 - ; SI-NEXT: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE7]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec - ; SI-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[V_CMP_EQ_U64_e64_2]], killed [[V_CMP_EQ_U64_e64_3]], implicit-def dead $scc - ; SI-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub4, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub5, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub0, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE8:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_8]], %subreg.sub0, [[V_READFIRSTLANE_B32_9]], %subreg.sub1 - ; SI-NEXT: [[V_CMP_EQ_U64_e64_4:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE8]], [[REG_SEQUENCE2]].sub4_sub5, implicit $exec - ; SI-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[S_AND_B32_1]], killed [[V_CMP_EQ_U64_e64_4]], implicit-def dead $scc - ; SI-NEXT: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub6, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub7, implicit $exec + ; SI-NEXT: 
[[V_CMP_EQ_U64_e64_4:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE8]], [[GLOBAL_LOAD_DWORDX4_2]].sub0_sub1, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub2, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub3, implicit $exec ; SI-NEXT: [[REG_SEQUENCE9:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_10]], %subreg.sub0, [[V_READFIRSTLANE_B32_11]], %subreg.sub1 - ; SI-NEXT: [[V_CMP_EQ_U64_e64_5:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE9]], [[REG_SEQUENCE2]].sub6_sub7, implicit $exec - ; SI-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[S_AND_B32_2]], killed [[V_CMP_EQ_U64_e64_5]], implicit-def dead $scc - ; SI-NEXT: [[REG_SEQUENCE10:%[0-9]+]]:sgpr_256 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_4]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_5]], %subreg.sub1, killed [[V_READFIRSTLANE_B32_6]], %subreg.sub2, killed [[V_READFIRSTLANE_B32_7]], %subreg.sub3, killed [[V_READFIRSTLANE_B32_8]], %subreg.sub4, killed [[V_READFIRSTLANE_B32_9]], %subreg.sub5, killed [[V_READFIRSTLANE_B32_10]], %subreg.sub6, killed [[V_READFIRSTLANE_B32_11]], %subreg.sub7 + ; SI-NEXT: [[V_CMP_EQ_U64_e64_5:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE9]], [[GLOBAL_LOAD_DWORDX4_2]].sub2_sub3, implicit $exec + ; SI-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[V_CMP_EQ_U64_e64_4]], killed [[V_CMP_EQ_U64_e64_5]], implicit-def dead $scc + ; SI-NEXT: [[REG_SEQUENCE10:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_8]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_9]], %subreg.sub1, killed [[V_READFIRSTLANE_B32_10]], %subreg.sub2, killed [[V_READFIRSTLANE_B32_11]], %subreg.sub3 ; SI-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_3]], implicit-def $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: {{ $}} ; SI-NEXT: bb.5: ; SI-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[IMAGE_SAMPLE_V1_V2_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx10 undef %27:vreg_64, killed [[REG_SEQUENCE10]], [[REG_SEQUENCE5]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from custom "ImageResource") + ; SI-NEXT: [[IMAGE_SAMPLE_V1_V2_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx10 undef %27:vreg_64, [[REG_SEQUENCE7]], killed [[REG_SEQUENCE10]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from custom "ImageResource") ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc ; SI-NEXT: SI_WATERFALL_LOOP %bb.4, implicit $exec ; SI-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index 16de2cbe761f3d..29f8c60ad281b4 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -431,9 +431,12 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline { ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr34 +; GFX9-O0-NEXT: ; implicit-def: $sgpr34 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: ; kill: 
def $vgpr0_vgpr1 killed $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 @@ -458,6 +461,7 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline { ; GFX9-O0-NEXT: v_add3_u32 v0, v0, v1, v2 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr36 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 @@ -465,6 +469,7 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline { ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: s_mov_b32 s35, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 @@ -578,6 +583,9 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a ; GFX9-O0-NEXT: v_readlane_b32 s39, v10, 5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr40 +; GFX9-O0-NEXT: ; implicit-def: $sgpr40 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 ; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[40:41], v2, v4 @@ -712,6 +720,9 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr35 +; GFX9-O0-NEXT: ; implicit-def: $sgpr35 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 ; GFX9-O0-NEXT: s_mov_b32 s35, 0x7fffffff @@ -728,6 +739,9 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr35 +; GFX9-O0-NEXT: ; implicit-def: $sgpr35 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 @@ -751,6 +765,13 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr35 +; GFX9-O0-NEXT: ; implicit-def: $sgpr35 +; GFX9-O0-NEXT: ; implicit-def: $sgpr35 +; GFX9-O0-NEXT: ; implicit-def: $sgpr35 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10