diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp index 6f6ad5cf82cae..244d58c2fd081 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp @@ -107,3 +107,183 @@ void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) { S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg()); } } + +MachineInstrBuilder AMDGPU::buildReadAnyLaneB32(MachineIRBuilder &B, + const DstOp &SgprDst, + const SrcOp &VgprSrc, + const RegisterBankInfo &RBI) { + auto RFL = B.buildInstr(AMDGPU::G_READANYLANE, {SgprDst}, {VgprSrc}); + Register Dst = RFL->getOperand(0).getReg(); + Register Src = RFL->getOperand(1).getReg(); + MachineRegisterInfo &MRI = *B.getMRI(); + if (!MRI.getRegBankOrNull(Dst)) + MRI.setRegBank(Dst, RBI.getRegBank(SGPRRegBankID)); + if (!MRI.getRegBankOrNull(Src)) + MRI.setRegBank(Src, RBI.getRegBank(VGPRRegBankID)); + return RFL; +} + +MachineInstrBuilder +AMDGPU::buildReadAnyLaneSequenceOfB32(MachineIRBuilder &B, const DstOp &SgprDst, + const SrcOp &VgprSrc, LLT B32Ty, + const RegisterBankInfo &RBI) { + MachineRegisterInfo &MRI = *B.getMRI(); + SmallVector SgprDstParts; + auto Unmerge = B.buildUnmerge(B32Ty, VgprSrc); + for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { + SgprDstParts.push_back( + buildReadAnyLaneB32(B, B32Ty, Unmerge.getReg(i), RBI).getReg(0)); + } + + auto Merge = B.buildMergeLikeInstr(SgprDst, SgprDstParts); + MRI.setRegBank(Merge.getReg(0), RBI.getRegBank(AMDGPU::SGPRRegBankID)); + return Merge; +} + +MachineInstrBuilder +AMDGPU::buildReadAnyLaneSequenceOfS64(MachineIRBuilder &B, const DstOp &SgprDst, + const SrcOp &VgprSrc, + const RegisterBankInfo &RBI) { + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); + MachineRegisterInfo &MRI = *B.getMRI(); + SmallVector SgprDstParts; + auto Unmerge = B.buildUnmerge(S64, VgprSrc); + + for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { + MRI.setRegBank(Unmerge.getReg(i), RBI.getRegBank(AMDGPU::VGPRRegBankID)); + auto Unmerge64 = B.buildUnmerge(S32, Unmerge.getReg(i)); + SmallVector Unmerge64Parts; + Unmerge64Parts.push_back( + buildReadAnyLaneB32(B, S32, Unmerge64.getReg(0), RBI).getReg(0)); + Unmerge64Parts.push_back( + buildReadAnyLaneB32(B, S32, Unmerge64.getReg(1), RBI).getReg(0)); + Register MergeReg = B.buildMergeLikeInstr(S64, Unmerge64Parts).getReg(0); + MRI.setRegBank(MergeReg, RBI.getRegBank(AMDGPU::SGPRRegBankID)); + SgprDstParts.push_back(MergeReg); + } + + auto Merge = B.buildMergeLikeInstr(SgprDst, SgprDstParts); + MRI.setRegBank(Merge.getReg(0), RBI.getRegBank(AMDGPU::SGPRRegBankID)); + return Merge; +} + +MachineInstrBuilder AMDGPU::buildReadAnyLane(MachineIRBuilder &B, + const DstOp &SgprDst, + const SrcOp &VgprSrc, + const RegisterBankInfo &RBI) { + MachineRegisterInfo &MRI = *B.getMRI(); + LLT S16 = LLT::scalar(16); + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); + LLT S256 = LLT::scalar(256); + LLT V2S16 = LLT::fixed_vector(2, 16); + LLT Ty = SgprDst.getLLTTy(MRI); + + if (Ty == S16) { + return B.buildTrunc( + SgprDst, buildReadAnyLaneB32(B, S32, B.buildAnyExt(S32, VgprSrc), RBI)); + } + + if (Ty == S32 || Ty == V2S16 || + (Ty.isPointer() && Ty.getSizeInBits() == 32)) { + return buildReadAnyLaneB32(B, SgprDst, VgprSrc, RBI); + } + + if (Ty == S64 || Ty == S256 || (Ty.isPointer() && Ty.getSizeInBits() == 64) || + (Ty.isVector() && Ty.getElementType() == S32)) { + return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, S32, RBI); + } + + if (Ty.isVector() 
&& Ty.getElementType() == S16) { + return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, V2S16, RBI); + } + + if (Ty.isVector() && Ty.getElementType() == S64) { + return buildReadAnyLaneSequenceOfS64(B, SgprDst, VgprSrc, RBI); + } + + llvm_unreachable("Type not supported"); +} + +void AMDGPU::buildReadAnyLaneDst(MachineIRBuilder &B, MachineInstr &MI, + const RegisterBankInfo &RBI) { + MachineRegisterInfo &MRI = *B.getMRI(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstBank = MRI.getRegBankOrNull(Dst); + if (DstBank != &RBI.getRegBank(AMDGPU::SGPRRegBankID)) + return; + + Register VgprDst = MRI.createGenericVirtualRegister(MRI.getType(Dst)); + MRI.setRegBank(VgprDst, RBI.getRegBank(AMDGPU::VGPRRegBankID)); + + MI.getOperand(0).setReg(VgprDst); + MachineBasicBlock *MBB = MI.getParent(); + B.setInsertPt(*MBB, std::next(MI.getIterator())); + // readAnyLane VgprDst into Dst after MI. + buildReadAnyLane(B, Dst, VgprDst, RBI); + return; +} + +bool AMDGPU::isLaneMask(Register Reg, MachineRegisterInfo &MRI, + const SIRegisterInfo *TRI) { + const RegisterBank *RB = MRI.getRegBankOrNull(Reg); + if (RB && RB->getID() == VCCRegBankID) + return true; + + const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); + if (RC && TRI->isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1)) + return true; + + return false; +} + +bool AMDGPU::isSgprRB(Register Reg, MachineRegisterInfo &MRI) { + const RegisterBank *RB = MRI.getRegBankOrNull(Reg); + if (RB && RB->getID() == SGPRRegBankID) + return true; + + return false; +} + +bool AMDGPU::isVgprRB(Register Reg, MachineRegisterInfo &MRI) { + const RegisterBank *RB = MRI.getRegBankOrNull(Reg); + if (RB && RB->getID() == VGPRRegBankID) + return true; + + return false; +} + +void AMDGPU::cleanUpAfterCombine(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineInstr *Optional0) { + MI.eraseFromParent(); + if (Optional0 && isTriviallyDead(*Optional0, MRI)) + Optional0->eraseFromParent(); +} + +bool AMDGPU::hasSGPRS1(MachineFunction &MF, MachineRegisterInfo &MRI) { + for (auto &MBB : MF) { + for (auto &MI : make_early_inc_range(MBB)) { + for (MachineOperand &Op : MI.operands()) { + if (!Op.isReg()) + continue; + + Register Reg = Op.getReg(); + if (!Reg.isVirtual()) + continue; + + if (!isSgprRB(Reg, MRI) || MRI.getType(Reg) != LLT::scalar(1)) + continue; + + MI.getParent()->dump(); + MI.dump(); + return true; + } + } + } + return false; +} + +bool AMDGPU::isS1(Register Reg, MachineRegisterInfo &MRI) { + return MRI.getType(Reg) == LLT::scalar(1); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h index 4d504d0204d81..bf812dd86fbd0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h @@ -9,7 +9,11 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H +#include "AMDGPURegisterBankInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/Register.h" #include @@ -48,7 +52,58 @@ class IntrinsicLaneMaskAnalyzer { // This will not be needed when we turn of LCSSA for global-isel. 
void findLCSSAPhi(Register Reg); }; + +void buildReadAnyLaneS1(MachineIRBuilder &B, MachineInstr &MI, + const RegisterBankInfo &RBI); + +MachineInstrBuilder buildReadAnyLaneB32(MachineIRBuilder &B, + const DstOp &SgprDst, + const SrcOp &VgprSrc, + const RegisterBankInfo &RBI); + +MachineInstrBuilder buildReadAnyLaneSequenceOfB32(MachineIRBuilder &B, + const DstOp &SgprDst, + const SrcOp &VgprSrc, + LLT B32Ty, + const RegisterBankInfo &RBI); + +MachineInstrBuilder buildReadAnyLaneSequenceOfS64(MachineIRBuilder &B, + const DstOp &SgprDst, + const SrcOp &VgprSrc, + const RegisterBankInfo &RBI); + +MachineInstrBuilder buildReadAnyLane(MachineIRBuilder &B, const DstOp &SgprDst, + const SrcOp &VgprSrc, + const RegisterBankInfo &RBI); + +// Create new vgpr destination register for MI then move it to current +// MI's sgpr destination using one or more G_READANYLANE instructions. +void buildReadAnyLaneDst(MachineIRBuilder &B, MachineInstr &MI, + const RegisterBankInfo &RBI); + +// Share with SIRegisterInfo::isUniformReg? This could make uniformity info give +// same result in later passes. +bool isLaneMask(Register Reg, MachineRegisterInfo &MRI, + const SIRegisterInfo *TRI); + +bool isSgprRB(Register Reg, MachineRegisterInfo &MRI); + +bool isVgprRB(Register Reg, MachineRegisterInfo &MRI); + +template +inline MIPatternMatch::UnaryOp_match +m_GReadAnyLane(const SrcTy &Src) { + return MIPatternMatch::UnaryOp_match(Src); } -} + +void cleanUpAfterCombine(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineInstr *Optional0 = nullptr); + +bool hasSGPRS1(MachineFunction &MF, MachineRegisterInfo &MRI); + +bool isS1(Register Reg, MachineRegisterInfo &MRI); + +} // namespace AMDGPU +} // namespace llvm #endif diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 800bdbe04cf70..3e1a78050c8a2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -217,6 +217,75 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { return true; } +bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const { + const DebugLoc &DL = I.getDebugLoc(); + MachineBasicBlock *BB = I.getParent(); + + unsigned CmpOpc = + STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32; + MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)) + .addReg(I.getOperand(1).getReg()) + .addImm(0); + if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI)) + return false; + + Register DstReg = I.getOperand(0).getReg(); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC); + + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, AMDGPU::SGPR_32RegClass, *MRI); +} + +bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const { + const DebugLoc &DL = I.getDebugLoc(); + MachineBasicBlock *BB = I.getParent(); + + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + std::optional Arg = + getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI); + + if (Arg) { + const int64_t Value = Arg->Value.getZExtValue(); + if (Value == 0) { + unsigned Opcode = STI.isWave64() ? 
AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; + BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0); + } else { + assert(Value == 1); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg) + .addReg(TRI.getExec()); + } + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI); + } + + // RBLegalize was ensures that SrcReg is bool in reg (high bits are 0). + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg); + + unsigned SelectOpcode = + STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; + MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) + .addReg(TRI.getExec()) + .addImm(0); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); +} + +bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const { + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + + const DebugLoc &DL = I.getDebugLoc(); + MachineBasicBlock *BB = I.getParent(); + + auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg) + .addReg(SrcReg); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI); +} + bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { const Register DefReg = I.getOperand(0).getReg(); const LLT DefTy = MRI->getType(DefReg); @@ -249,7 +318,21 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { } } - // TODO: Verify that all registers have the same bank + // If inputs have register bank, assign corresponding reg class. + // Note: registers don't need to have the same reg bank. + for (unsigned i = 1; i < I.getNumOperands(); i += 2) { + const Register SrcReg = I.getOperand(i).getReg(); + + const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg); + if (RB) { + const LLT SrcTy = MRI->getType(SrcReg); + const TargetRegisterClass *SrcRC = + TRI.getRegClassForTypeOnBank(SrcTy, *RB); + if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) + return false; + } + } + I.setDesc(TII.get(TargetOpcode::PHI)); return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI); } @@ -3656,6 +3739,12 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { return selectStackRestore(I); case AMDGPU::G_PHI: return selectPHI(I); + case AMDGPU::G_COPY_SCC_VCC: + return selectCOPY_SCC_VCC(I); + case AMDGPU::G_COPY_VCC_SCC: + return selectCOPY_VCC_SCC(I); + case AMDGPU::G_READANYLANE: + return selectReadAnyLane(I); case TargetOpcode::G_CONSTANT: case TargetOpcode::G_FCONSTANT: default: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index df39ecbd61bce..11bba12499f0c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -87,6 +87,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector { bool constrainCopyLikeIntrin(MachineInstr &MI, unsigned NewOpc) const; bool selectCOPY(MachineInstr &I) const; + bool selectCOPY_SCC_VCC(MachineInstr &I) const; + bool selectCOPY_VCC_SCC(MachineInstr &I) const; + bool selectReadAnyLane(MachineInstr &I) const; bool selectPHI(MachineInstr &I) const; bool selectG_TRUNC(MachineInstr &I) const; bool selectG_SZA_EXT(MachineInstr &I) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURBLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURBLegalize.cpp index 9a9722559377f..7c348bf759cad 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURBLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURBLegalize.cpp @@ -18,7 
+18,13 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUGlobalISelUtils.h" +#include "AMDGPURBLegalizeHelper.h" +#include "GCNSubtarget.h" +#include "llvm/CodeGen/GlobalISel/CSEInfo.h" +#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/InitializePasses.h" #define DEBUG_TYPE "rb-legalize" @@ -41,6 +47,9 @@ class AMDGPURBLegalize : public MachineFunctionPass { StringRef getPassName() const override { return "AMDGPU RB Legalize"; } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -56,6 +65,9 @@ class AMDGPURBLegalize : public MachineFunctionPass { INITIALIZE_PASS_BEGIN(AMDGPURBLegalize, DEBUG_TYPE, "AMDGPU RB Legalize", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass) INITIALIZE_PASS_END(AMDGPURBLegalize, DEBUG_TYPE, "AMDGPU RB Legalize", false, false) @@ -69,6 +81,241 @@ FunctionPass *llvm::createAMDGPURBLegalizePass() { using namespace AMDGPU; +const RegBankLegalizeRules &getRules(const GCNSubtarget &ST, + MachineRegisterInfo &MRI) { + static std::mutex GlobalMutex; + static SmallDenseMap> + CacheForRuleSet; + std::lock_guard Lock(GlobalMutex); + if (!CacheForRuleSet.contains(ST.getGeneration())) { + auto Rules = std::make_unique(ST, MRI); + CacheForRuleSet[ST.getGeneration()] = std::move(Rules); + } else { + CacheForRuleSet[ST.getGeneration()]->refreshRefs(ST, MRI); + } + return *CacheForRuleSet[ST.getGeneration()]; +} + bool AMDGPURBLegalize::runOnMachineFunction(MachineFunction &MF) { + + const GCNSubtarget &ST = MF.getSubtarget(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + // Setup the instruction builder with CSE. + std::unique_ptr MIRBuilder; + const TargetPassConfig &TPC = getAnalysis(); + GISelCSEAnalysisWrapper &Wrapper = + getAnalysis().getCSEWrapper(); + GISelCSEInfo *CSEInfo = nullptr; + GISelObserverWrapper Observer; + + if (TPC.isGISelCSEEnabled()) { + MIRBuilder = std::make_unique(); + CSEInfo = &Wrapper.get(TPC.getCSEConfig()); + MIRBuilder->setCSEInfo(CSEInfo); + Observer.addObserver(CSEInfo); + MIRBuilder->setChangeObserver(Observer); + } else { + MIRBuilder = std::make_unique(); + } + MIRBuilder->setMF(MF); + + RAIIDelegateInstaller DelegateInstaller(MF, &Observer); + RAIIMFObserverInstaller MFObserverInstaller(MF, Observer); + + const MachineUniformityInfo &MUI = + getAnalysis().getUniformityInfo(); + const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo(); + + // RegBankLegalizeRules is initialized with assigning sets of IDs to opcodes. + const RegBankLegalizeRules &RBLRules = getRules(ST, MRI); + + // Logic that does legalization based on IDs assigned to Opcode. + RegBankLegalizeHelper RBLegalizeHelper(*MIRBuilder, MRI, MUI, RBI, RBLRules); + + SmallVector AllInst; + + for (auto &MBB : MF) { + for (MachineInstr &MI : MBB) { + AllInst.push_back(&MI); + } + } + + for (auto &MI : AllInst) { + if (!MI->isPreISelOpcode()) + continue; + + unsigned Opc = MI->getOpcode(); + + // Insert point for use operands needs some calculation. + if (Opc == G_PHI) { + RBLegalizeHelper.applyMappingPHI(*MI); + continue; + } + + // Opcodes that support pretty much all combinations of reg banks and LLTs + // (except S1). 
There is no point in writing rules for them. + if (Opc == G_BUILD_VECTOR || Opc == G_UNMERGE_VALUES || + Opc == G_MERGE_VALUES) { + RBLegalizeHelper.applyMappingTrivial(*MI); + continue; + } + + // Opcodes that also support S1. S1 rules are in RegBankLegalizeRules. + // Remaining reg bank and LLT combinations are trivially accepted. + if ((Opc == G_CONSTANT || Opc == G_FCONSTANT || Opc == G_IMPLICIT_DEF) && + !isS1(MI->getOperand(0).getReg(), MRI)) { + assert(isSgprRB(MI->getOperand(0).getReg(), MRI)); + continue; + } + + if (!RBLegalizeHelper.findRuleAndApplyMapping(*MI)) { + MI->dump(); + llvm_unreachable("failed to match any of the rules"); + } + } + + LLT S1 = LLT::scalar(1); + LLT S16 = LLT::scalar(16); + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); + + // SGPR S1 clean up combines: + // - SGPR S1(S32) to SGPR S1(S32) Copy: anyext + trunc combine. + // In RBLegalize 'S1 Dst' are legalized into S32 as'S1Dst = Trunc S32Dst' + // and 'S1 Src' into 'S32Src = Anyext S1Src'. + // S1 Truncs and Anyexts that come from legalizer will also be cleaned up. + // Note: they can have non-S32 types e.g. S16 = Anyext S1 or S1 = Trunc S64. + // - Sgpr S1(S32) to VCC Copy: G_COPY_VCC_SCC combine. + // Divergent instruction uses Sgpr S1 as input that should be lane mask(VCC) + // Legalizing this use creates Sgpr S1(S32) to VCC Copy. + + // Note: Remaining S1 copies, S1s are either SGPR S1(S32) or VCC S1: + // - VCC to VCC Copy: nothing to do here, just a regular copy. + // - VCC to SGPR S1 Copy: Should not exist in a form of COPY instruction(*). + // Note: For 'uniform-in-VCC to SGPR-S1 copy' G_COPY_SCC_VCC is used + // instead. When only available instruction creates VCC result, use of + // UniformInVcc results in creating G_COPY_SCC_VCC. + + // (*)Explanation for 'SGPR S1(uniform) = COPY VCC(divergent)': + // Copy from divergent to uniform register indicates an error in either: + // - Uniformity analysis: Uniform instruction has divergent input. If one of + // the inputs is divergent, instruction should be divergent! + // - RBLegalizer not executing in waterfall loop (missing implementation) + + using namespace MIPatternMatch; + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + for (auto &MBB : MF) { + for (auto &MI : make_early_inc_range(MBB)) { + + if (MI.getOpcode() == G_TRUNC && isTriviallyDead(MI, MRI)) { + MI.eraseFromParent(); + continue; + } + + if (MI.getOpcode() == COPY) { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + if (!Dst.isVirtual() || !Src.isVirtual()) + continue; + + // This is cross bank copy, sgpr S1 to lane mask. + // sgpr S1 must be result of G_TRUNC of SGPR S32. + if (isLaneMask(Dst, MRI, TRI) && isSgprRB(Src, MRI)) { + MachineInstr *Trunc = MRI.getVRegDef(Src); + Register SrcSgpr32 = Trunc->getOperand(1).getReg(); + + assert(Trunc->getOpcode() == G_TRUNC); + assert(isSgprRB(SrcSgpr32, MRI) && MRI.getType(SrcSgpr32) == S32); + + MIRBuilder->setInstr(MI); + const RegisterBank *SgprRB = &RBI.getRegBank(SGPRRegBankID); + Register BoolSrc = MRI.createVirtualRegister({SgprRB, S32}); + Register One = MRI.createVirtualRegister({SgprRB, S32}); + // Ensure that truncated bits in BoolSrc are 0. 
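+          // e.g. %One:sgpr(s32) = G_CONSTANT i32 1
+          //      %BoolSrc:sgpr(s32) = G_AND %SrcSgpr32, %One
+          //      %Dst:vcc(s1) = G_COPY_VCC_SCC %BoolSrc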
+ MIRBuilder->buildConstant(One, 1); + MIRBuilder->buildAnd(BoolSrc, SrcSgpr32, One); + MIRBuilder->buildInstr(G_COPY_VCC_SCC, {Dst}, {BoolSrc}); + cleanUpAfterCombine(MI, MRI, Trunc); + continue; + } + + // Src = G_READANYLANE VgprRBSrc + // Dst = COPY Src + // -> + // Dst = VgprRBSrc + if (isVgprRB(Dst, MRI) && isSgprRB(Src, MRI)) { + MachineInstr *RFL = MRI.getVRegDef(Src); + Register VgprRBSrc; + if (mi_match(RFL, MRI, m_GReadAnyLane(m_Reg(VgprRBSrc)))) { + assert(isVgprRB(VgprRBSrc, MRI)); + MRI.replaceRegWith(Dst, VgprRBSrc); + cleanUpAfterCombine(MI, MRI, RFL); + continue; + } + } + } + + // Sgpr(S1) = G_TRUNC TruncSrc + // Dst = G_ANYEXT Sgpr(S1) + // -> + // Dst = G_... TruncSrc + if (MI.getOpcode() == G_ANYEXT) { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + if (!Dst.isVirtual() || !Src.isVirtual() || MRI.getType(Src) != S1) + continue; + + // Note: Sgpr S16 is anyextened to S32 for some opcodes and could use + // same combine but it is not required for correctness so we skip it. + // S16 is legal because there is instruction with VGPR S16. + + MachineInstr *Trunc = MRI.getVRegDef(Src); + if (Trunc->getOpcode() != G_TRUNC) + continue; + + Register TruncSrc = Trunc->getOperand(1).getReg(); + LLT DstTy = MRI.getType(Dst); + LLT TruncSrcTy = MRI.getType(TruncSrc); + + if (DstTy == TruncSrcTy) { + MRI.replaceRegWith(Dst, TruncSrc); + cleanUpAfterCombine(MI, MRI, Trunc); + continue; + } + + MIRBuilder->setInstr(MI); + + if (DstTy == S32 && TruncSrcTy == S64) { + const RegisterBank *SgprRB = &RBI.getRegBank(SGPRRegBankID); + Register Lo = MRI.createVirtualRegister({SgprRB, S32}); + Register Hi = MRI.createVirtualRegister({SgprRB, S32}); + MIRBuilder->buildUnmerge({Lo, Hi}, TruncSrc); + + MRI.replaceRegWith(Dst, Lo); + cleanUpAfterCombine(MI, MRI, Trunc); + continue; + } + + if (DstTy == S32 && TruncSrcTy == S16) { + MIRBuilder->buildAnyExt(Dst, TruncSrc); + cleanUpAfterCombine(MI, MRI, Trunc); + continue; + } + + if (DstTy == S16 && TruncSrcTy == S32) { + MIRBuilder->buildTrunc(Dst, TruncSrc); + cleanUpAfterCombine(MI, MRI, Trunc); + continue; + } + + llvm_unreachable("missing anyext + trunc combine\n"); + } + } + } + + assert(!hasSGPRS1(MF, MRI) && "detetected SGPR S1 register"); + return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURBLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURBLegalizeHelper.cpp new file mode 100644 index 0000000000000..f58f0a315096d --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPURBLegalizeHelper.cpp @@ -0,0 +1,705 @@ +//===-- AMDGPURBLegalizeHelper.cpp ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// Implements actual lowering algorithms for each ID that can be used in +/// Rule.OperandMapping. Similar to legalizer helper but with register banks. 
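+/// Lowering is selected by the LoweringMethodID of the matched
+/// RegBankLLTMapping; DoNotLower keeps the instruction unchanged.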
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPURBLegalizeHelper.h" +#include "AMDGPUGlobalISelUtils.h" +#include "AMDGPUInstrInfo.h" + +using namespace llvm; +using namespace AMDGPU; + +bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { + const SetOfRulesForOpcode &RuleSet = RBLRules.getRulesForOpc(MI); + const RegBankLLTMapping &Mapping = RuleSet.findMappingForMI(MI, MRI, MUI); + + SmallSet WaterfallSGPRs; + unsigned OpIdx = 0; + if (Mapping.DstOpMapping.size() > 0) { + B.setInsertPt(*MI.getParent(), std::next(MI.getIterator())); + applyMappingDst(MI, OpIdx, Mapping.DstOpMapping); + } + if (Mapping.SrcOpMapping.size() > 0) { + B.setInstr(MI); + applyMappingSrc(MI, OpIdx, Mapping.SrcOpMapping, WaterfallSGPRs); + } + + lower(MI, Mapping, WaterfallSGPRs); + return true; +} + +void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, + ArrayRef LLTBreakdown, LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register BasePtrReg = MI.getOperand(1).getReg(); + LLT PtrTy = MRI.getType(BasePtrReg); + const RegisterBank *PtrRB = MRI.getRegBankOrNull(BasePtrReg); + LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits()); + SmallVector LoadPartRegs; + + unsigned ByteOffset = 0; + for (LLT PartTy : LLTBreakdown) { + Register BasePtrPlusOffsetReg; + if (ByteOffset == 0) { + BasePtrPlusOffsetReg = BasePtrReg; + } else { + BasePtrPlusOffsetReg = MRI.createVirtualRegister({PtrRB, PtrTy}); + Register OffsetReg = MRI.createVirtualRegister({PtrRB, OffsetTy}); + B.buildConstant(OffsetReg, ByteOffset); + B.buildPtrAdd(BasePtrPlusOffsetReg, BasePtrReg, OffsetReg); + } + MachineMemOperand *BasePtrPlusOffsetMMO = + MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy); + Register PartLoad = MRI.createVirtualRegister({DstRB, PartTy}); + B.buildLoad(PartLoad, BasePtrPlusOffsetReg, *BasePtrPlusOffsetMMO); + LoadPartRegs.push_back(PartLoad); + ByteOffset += PartTy.getSizeInBytes(); + } + + if (!MergeTy.isValid()) { + // Loads are of same size, concat or merge them together. + B.buildMergeLikeInstr(Dst, LoadPartRegs); + } else { + // Load(s) are not all of same size, need to unmerge them to smaller pieces + // of MergeTy type, then merge them all together in Dst. 
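+    // e.g. S96 split as {S64, S32} with MergeTy S32: the S64 piece is unmerged
+    // into two S32 pieces, giving three S32 pieces to merge into the S96 Dst.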
+ SmallVector MergeTyParts; + for (Register Reg : LoadPartRegs) { + if (MRI.getType(Reg) == MergeTy) { + MergeTyParts.push_back(Reg); + } else { + auto Unmerge = B.buildUnmerge(MergeTy, Reg); + for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { + Register UnmergeReg = Unmerge->getOperand(i).getReg(); + MRI.setRegBank(UnmergeReg, *DstRB); + MergeTyParts.push_back(UnmergeReg); + } + } + } + B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + +void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, + LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register BasePtrReg = MI.getOperand(1).getReg(); + + Register BasePtrPlusOffsetReg; + BasePtrPlusOffsetReg = BasePtrReg; + + MachineMemOperand *BasePtrPlusOffsetMMO = + MF.getMachineMemOperand(&BaseMMO, 0, WideTy); + Register WideLoad = MRI.createVirtualRegister({DstRB, WideTy}); + B.buildLoad(WideLoad, BasePtrPlusOffsetReg, *BasePtrPlusOffsetMMO); + + if (WideTy.isScalar()) { + B.buildTrunc(Dst, WideLoad); + } else { + SmallVector MergeTyParts; + unsigned NumEltsMerge = + MRI.getType(Dst).getSizeInBits() / MergeTy.getSizeInBits(); + auto Unmerge = B.buildUnmerge(MergeTy, WideLoad); + for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { + Register UnmergeReg = Unmerge->getOperand(i).getReg(); + MRI.setRegBank(UnmergeReg, *DstRB); + if (i < NumEltsMerge) + MergeTyParts.push_back(UnmergeReg); + } + B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + +void RegBankLegalizeHelper::lower(MachineInstr &MI, + const RegBankLLTMapping &Mapping, + SmallSet &WaterfallSGPRs) { + + switch (Mapping.LoweringMethod) { + case DoNotLower: + return; + case UniExtToSel: { + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + auto True = + B.buildConstant(createSgpr(Ty), MI.getOpcode() == G_SEXT ? -1 : 1); + auto False = B.buildConstant(createSgpr(Ty), 0); + // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is compare. + // We are making select here. S1 cond was already 'any-extended to S32' + + // 'AND with 1 to clean high bits' by Sgpr32AExtBoolInReg. + B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True, + False); + MI.eraseFromParent(); + return; + } + case Ext32To64: { + const RegisterBank *RB = getRegBank(MI.getOperand(0).getReg()); + Register Hi = MRI.createVirtualRegister({RB, S32}); + + if (MI.getOpcode() == AMDGPU::G_ZEXT) { + B.buildConstant(Hi, 0); + } else { + Register ShiftAmt = MRI.createVirtualRegister({RB, S32}); + // Replicate sign bit from 32-bit extended part. + B.buildConstant(ShiftAmt, 31); + B.buildAShr(Hi, MI.getOperand(1).getReg(), ShiftAmt); + } + + B.buildMergeLikeInstr(MI.getOperand(0).getReg(), + {MI.getOperand(1).getReg(), Hi}); + MI.eraseFromParent(); + return; + } + case UniCstExt: { + uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue(); + B.buildConstant(MI.getOperand(0).getReg(), ConstVal); + + MI.eraseFromParent(); + return; + } + case VgprToVccCopy: { + Register Src = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(Src); + // Take lowest bit from each lane and put it in lane mask. + // Lowering via compare, but we need to clean high bits first as compare + // compares all bits in register. 
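+    // e.g. for S32: %BoolSrc = G_AND %Src, 1
+    //              %Dst:vcc(s1) = G_ICMP intpred(ne), %BoolSrc, 0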
+ Register BoolSrc = createVgpr(Ty); + if (Ty == S64) { + auto Src64 = B.buildUnmerge({createVgpr(S32), createVgpr(S32)}, Src); + auto One = B.buildConstant(createVgpr(S32), 1); + auto AndLo = B.buildAnd(createVgpr(S32), Src64.getReg(0), One); + auto Zero = B.buildConstant(createVgpr(S32), 0); + auto AndHi = B.buildAnd(createVgpr(S32), Src64.getReg(1), Zero); + B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi}); + } else { + assert(Ty == S32 || Ty == S16); + auto One = B.buildConstant(createVgpr(Ty), 1); + B.buildAnd(BoolSrc, Src, One); + } + auto Zero = B.buildConstant(createVgpr(Ty), 0); + B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero); + MI.eraseFromParent(); + return; + } + case SplitTo32: { + auto Op1 = B.buildUnmerge({createVgpr(S32), createVgpr(S32)}, + MI.getOperand(1).getReg()); + auto Op2 = B.buildUnmerge({createVgpr(S32), createVgpr(S32)}, + MI.getOperand(2).getReg()); + auto ResLo = B.buildInstr(MI.getOpcode(), {createVgpr(S32)}, + {Op1.getReg(0), Op2.getReg(0)}); + auto ResHi = B.buildInstr(MI.getOpcode(), {createVgpr(S32)}, + {Op1.getReg(1), Op2.getReg(1)}); + B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {ResLo, ResHi}); + MI.eraseFromParent(); + break; + } + case SplitLoad: { + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + LLT V8S16 = LLT::fixed_vector(8, S16); + LLT V4S32 = LLT::fixed_vector(4, S32); + LLT V2S64 = LLT::fixed_vector(2, S64); + + if (DstTy == LLT::fixed_vector(8, S32)) + splitLoad(MI, {V4S32, V4S32}); + else if (DstTy == LLT::fixed_vector(16, S32)) + splitLoad(MI, {V4S32, V4S32, V4S32, V4S32}); + else if (DstTy == LLT::fixed_vector(4, S64)) + splitLoad(MI, {V2S64, V2S64}); + else if (DstTy == LLT::fixed_vector(8, S64)) + splitLoad(MI, {V2S64, V2S64, V2S64, V2S64}); + else if (DstTy == LLT::fixed_vector(16, S16)) + splitLoad(MI, {V8S16, V8S16}); + else if (DstTy == LLT::fixed_vector(32, S16)) + splitLoad(MI, {V8S16, V8S16, V8S16, V8S16}); + else if (DstTy == LLT::scalar(256)) + splitLoad(MI, {LLT::scalar(128), LLT::scalar(128)}); + else if (DstTy == LLT::scalar(96)) + splitLoad(MI, {S64, S32}, S32); + else if (DstTy == LLT::fixed_vector(3, S32)) + splitLoad(MI, {LLT::fixed_vector(2, S32), S32}, S32); + else if (DstTy == LLT::fixed_vector(6, S16)) + splitLoad(MI, {LLT::fixed_vector(4, S16), LLT::fixed_vector(2, S16)}, + LLT::fixed_vector(2, S16)); + else { + MI.dump(); + llvm_unreachable("SplitLoad type not supported\n"); + } + break; + } + case WidenLoad: { + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + if (DstTy == LLT::scalar(96)) + widenLoad(MI, LLT::scalar(128)); + else if (DstTy == LLT::fixed_vector(3, S32)) + widenLoad(MI, LLT::fixed_vector(4, S32), S32); + else if (DstTy == LLT::fixed_vector(6, S16)) + widenLoad(MI, LLT::fixed_vector(8, S16), LLT::fixed_vector(2, S16)); + else { + MI.dump(); + llvm_unreachable("WidenLoad type not supported\n"); + } + break; + } + } + + // TODO: executeInWaterfallLoop(... 
WaterfallSGPRs) +} + +LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMapingApplyID ID) { + switch (ID) { + case Vcc: + case UniInVcc: + return LLT::scalar(1); + case Sgpr16: + return LLT::scalar(16); + case Sgpr32: + case Sgpr32Trunc: + case Sgpr32AExt: + case Sgpr32AExtBoolInReg: + case Sgpr32SExt: + case UniInVgprS32: + case Vgpr32: + return LLT::scalar(32); + case Sgpr64: + case Vgpr64: + return LLT::scalar(64); + case SgprP1: + case VgprP1: + return LLT::pointer(1, 64); + case SgprP3: + case VgprP3: + return LLT::pointer(3, 32); + case SgprP4: + case VgprP4: + return LLT::pointer(4, 64); + case SgprP5: + case VgprP5: + return LLT::pointer(5, 32); + case SgprV4S32: + case VgprV4S32: + case UniInVgprV4S32: + return LLT::fixed_vector(4, 32); + default: + return LLT(); + } +} + +LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMapingApplyID ID, LLT Ty) { + switch (ID) { + case SgprB32: + case VgprB32: + case UniInVgprB32: + if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) || + Ty == LLT::pointer(3, 32) || Ty == LLT::pointer(5, 32) || + Ty == LLT::pointer(6, 32)) + return Ty; + return LLT(); + case SgprB64: + case VgprB64: + case UniInVgprB64: + if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) || + Ty == LLT::fixed_vector(4, 16) || Ty == LLT::pointer(0, 64) || + Ty == LLT::pointer(1, 64) || Ty == LLT::pointer(4, 64)) + return Ty; + return LLT(); + case SgprB96: + case VgprB96: + case UniInVgprB96: + if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) || + Ty == LLT::fixed_vector(6, 16)) + return Ty; + return LLT(); + case SgprB128: + case VgprB128: + case UniInVgprB128: + if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) || + Ty == LLT::fixed_vector(2, 64)) + return Ty; + return LLT(); + case SgprB256: + case VgprB256: + case UniInVgprB256: + if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) || + Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16)) + return Ty; + return LLT(); + case SgprB512: + case VgprB512: + case UniInVgprB512: + if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) || + Ty == LLT::fixed_vector(8, 64)) + return Ty; + return LLT(); + default: + return LLT(); + } +} + +const RegisterBank * +RegBankLegalizeHelper::getRBFromID(RegBankLLTMapingApplyID ID) { + switch (ID) { + case Vcc: + return VccRB; + + case Sgpr16: + case Sgpr32: + case Sgpr64: + case SgprP1: + case SgprP3: + case SgprP4: + case SgprP5: + case SgprV4S32: + case SgprB32: + case SgprB64: + case SgprB96: + case SgprB128: + case SgprB256: + case SgprB512: + case UniInVcc: + case UniInVgprS32: + case UniInVgprV4S32: + case UniInVgprB32: + case UniInVgprB64: + case UniInVgprB96: + case UniInVgprB128: + case UniInVgprB256: + case UniInVgprB512: + case Sgpr32Trunc: + case Sgpr32AExt: + case Sgpr32AExtBoolInReg: + case Sgpr32SExt: + return SgprRB; + + case Vgpr32: + case Vgpr64: + case VgprP1: + case VgprP3: + case VgprP4: + case VgprP5: + case VgprV4S32: + case VgprB32: + case VgprB64: + case VgprB96: + case VgprB128: + case VgprB256: + case VgprB512: + return VgprRB; + + default: + return nullptr; + } +} + +void RegBankLegalizeHelper::applyMappingDst( + MachineInstr &MI, unsigned &OpIdx, + const SmallVectorImpl &MethodIDs) { + // Defs start from operand 0 + for (; OpIdx < MethodIDs.size(); ++OpIdx) { + if (MethodIDs[OpIdx] == None) + continue; + MachineOperand &Op = MI.getOperand(OpIdx); + Register Reg = Op.getReg(); + LLT Ty = MRI.getType(Reg); + const RegisterBank *RB = getRegBank(Reg); + + switch (MethodIDs[OpIdx]) { + // vcc, sgpr 
and vgpr scalars, pointers and vectors + case Vcc: + case Sgpr16: + case Sgpr32: + case Sgpr64: + case SgprP1: + case SgprP3: + case SgprP4: + case SgprP5: + case SgprV4S32: + case Vgpr32: + case Vgpr64: + case VgprP1: + case VgprP3: + case VgprP4: + case VgprP5: + case VgprV4S32: { + assert(Ty == getTyFromID(MethodIDs[OpIdx])); + assert(RB == getRBFromID(MethodIDs[OpIdx])); + break; + } + + // sgpr and vgpr B-types + case SgprB32: + case SgprB64: + case SgprB96: + case SgprB128: + case SgprB256: + case SgprB512: + case VgprB32: + case VgprB64: + case VgprB96: + case VgprB128: + case VgprB256: + case VgprB512: { + assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty)); + assert(RB == getRBFromID(MethodIDs[OpIdx])); + break; + } + + // uniform in vcc/vgpr: scalars, vectors and B-types + case UniInVcc: { + assert(Ty == S1); + assert(RB == SgprRB); + Op.setReg(createVcc()); + auto CopyS32_Vcc = + B.buildInstr(G_COPY_SCC_VCC, {createSgpr(S32)}, {Op.getReg()}); + B.buildTrunc(Reg, CopyS32_Vcc); + break; + } + case UniInVgprS32: + case UniInVgprV4S32: { + assert(Ty == getTyFromID(MethodIDs[OpIdx])); + assert(RB == SgprRB); + AMDGPU::buildReadAnyLaneDst(B, MI, RBI); + break; + } + case UniInVgprB32: + case UniInVgprB64: + case UniInVgprB96: + case UniInVgprB128: + case UniInVgprB256: + case UniInVgprB512: { + assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty)); + assert(RB == SgprRB); + AMDGPU::buildReadAnyLaneDst(B, MI, RBI); + break; + } + + // sgpr trunc + case Sgpr32Trunc: { + assert(Ty.getSizeInBits() < 32); + assert(RB == SgprRB); + Op.setReg(createSgpr(S32)); + B.buildTrunc(Reg, Op.getReg()); + break; + } + case Invalid: { + MI.dump(); + llvm_unreachable("missing fast rule for MI"); + } + + default: + llvm_unreachable("ID not supported"); + } + } +} + +void RegBankLegalizeHelper::applyMappingSrc( + MachineInstr &MI, unsigned &OpIdx, + const SmallVectorImpl &MethodIDs, + SmallSet &SGPRWaterfallOperandRegs) { + for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) { + if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm) + continue; + + MachineOperand &Op = MI.getOperand(OpIdx); + Register Reg = Op.getReg(); + LLT Ty = MRI.getType(Reg); + const RegisterBank *RB = getRegBank(Reg); + + switch (MethodIDs[i]) { + case Vcc: { + assert(Ty == S1); + assert(RB == VccRB || RB == SgprRB); + + if (RB == SgprRB) { + auto Aext = B.buildAnyExt(createSgpr(S32), Reg); + auto CopyVcc_Scc = B.buildInstr(G_COPY_VCC_SCC, {createVcc()}, {Aext}); + Op.setReg(CopyVcc_Scc.getReg(0)); + } + break; + } + + // sgpr scalars, pointers and vectors + case Sgpr16: + case Sgpr32: + case Sgpr64: + case SgprP1: + case SgprP3: + case SgprP4: + case SgprP5: + case SgprV4S32: { + assert(Ty == getTyFromID(MethodIDs[i])); + assert(RB == getRBFromID(MethodIDs[i])); + break; + } + // sgpr B-types + case SgprB32: + case SgprB64: + case SgprB96: + case SgprB128: + case SgprB256: + case SgprB512: { + assert(Ty == getBTyFromID(MethodIDs[i], Ty)); + assert(RB == getRBFromID(MethodIDs[i])); + break; + } + + // vgpr scalars, pointers and vectors + case Vgpr32: + case Vgpr64: + case VgprP1: + case VgprP3: + case VgprP4: + case VgprP5: + case VgprV4S32: { + assert(Ty == getTyFromID(MethodIDs[i])); + if (RB != VgprRB) { + auto CopyToVgpr = + B.buildCopy(createVgpr(getTyFromID(MethodIDs[i])), Reg); + Op.setReg(CopyToVgpr.getReg(0)); + } + break; + } + // vgpr B-types + case VgprB32: + case VgprB64: + case VgprB96: + case VgprB128: + case VgprB256: + case VgprB512: { + assert(Ty == getBTyFromID(MethodIDs[i], Ty)); + if (RB != 
VgprRB) { + auto CopyToVgpr = + B.buildCopy(createVgpr(getBTyFromID(MethodIDs[i], Ty)), Reg); + Op.setReg(CopyToVgpr.getReg(0)); + } + break; + } + + // sgpr and vgpr scalars with extend + case Sgpr32AExt: { + // Note: this ext allows S1, and it is meant to be combined away. + assert(Ty.getSizeInBits() < 32); + assert(RB == SgprRB); + auto Aext = B.buildAnyExt(createSgpr(S32), Reg); + Op.setReg(Aext.getReg(0)); + break; + } + case Sgpr32AExtBoolInReg: { + // Note: this ext allows S1, and it is meant to be combined away. + assert(Ty.getSizeInBits() == 1); + assert(RB == SgprRB); + auto Aext = B.buildAnyExt(createSgpr(S32), Reg); + // Zext SgprS1 is not legal, this instruction is most of times meant to be + // combined away in RB combiner, so do not make AND with 1. + auto Cst1 = B.buildConstant(createSgpr(S32), 1); + auto BoolInReg = B.buildAnd(createSgpr(S32), Aext, Cst1); + Op.setReg(BoolInReg.getReg(0)); + break; + } + case Sgpr32SExt: { + assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32); + assert(RB == SgprRB); + auto Sext = B.buildSExt(createSgpr(S32), Reg); + Op.setReg(Sext.getReg(0)); + break; + } + default: + llvm_unreachable("ID not supported"); + } + } +} + +void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(Dst); + + LLT S32 = LLT::scalar(32); + if (Ty == LLT::scalar(1) && MUI.isUniform(Dst)) { + B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI()); + + Register NewDst = createSgpr(S32); + B.buildTrunc(Dst, NewDst); + MI.getOperand(0).setReg(NewDst); + + for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { + Register UseReg = MI.getOperand(i).getReg(); + + auto DefMI = MRI.getVRegDef(UseReg)->getIterator(); + MachineBasicBlock *DefMBB = DefMI->getParent(); + + B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI))); + + Register NewUseReg = createSgpr(S32); + B.buildAnyExt(NewUseReg, UseReg); + MI.getOperand(i).setReg(NewUseReg); + } + + return; + } + + // ALL divergent i1 phis should be already lowered and inst-selected into PHI + // with sgpr reg class and S1 LLT. + // Note: this includes divergent phis that don't require lowering. + if (Ty == LLT::scalar(1) && MUI.isDivergent(Dst)) { + llvm_unreachable("Make sure to run AMDGPUGlobalISelDivergenceLowering " + "before RB-legalize to lower lane mask(vcc) phis\n"); + } + + // We accept all types that can fit in some register class. + // Uniform G_PHIs have all sgpr registers. + // Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr. 
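+  // Only S32 and P4 (constant address space pointer) phis are handled so far;
+  // selectPHI later constrains operands based on the assigned register banks.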
+ if (Ty == LLT::scalar(32) || Ty == LLT::pointer(4, 64)) { + return; + } + + MI.dump(); + llvm_unreachable("type not supported\n"); +} + +bool operandsHaveRB(MachineInstr &MI, const RegisterBank *RB, + MachineRegisterInfo &MRI, unsigned StartOpIdx, + unsigned EndOpIdx) { + for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) { + if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB) + return false; + } + return true; +} + +void RegBankLegalizeHelper::applyMappingTrivial(MachineInstr &MI) { + const RegisterBank *RB = getRegBank(MI.getOperand(0).getReg()); + // Put RB on all registers + unsigned NumDefs = MI.getNumDefs(); + unsigned NumOperands = MI.getNumOperands(); + + assert(operandsHaveRB(MI, RB, MRI, 0, NumDefs - 1)); + if (RB->getID() == AMDGPU::SGPRRegBankID) + assert(operandsHaveRB(MI, RB, MRI, NumDefs, NumOperands - 1)); + + if (RB->getID() == AMDGPU::VGPRRegBankID) { + for (unsigned i = NumDefs; i < NumOperands; ++i) { + Register Reg = MI.getOperand(i).getReg(); + if (getRegBank(Reg) != RB) { + B.setInstr(MI); + auto Copy = B.buildCopy(createVgpr(MRI.getType(Reg)), Reg); + MI.getOperand(i).setReg(Copy.getReg(0)); + } + } + } +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPURBLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURBLegalizeHelper.h new file mode 100644 index 0000000000000..843d3d523a4a6 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPURBLegalizeHelper.h @@ -0,0 +1,119 @@ +//===- AMDGPURBLegalizeHelper ------------------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURBLEGALIZEHELPER_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPURBLEGALIZEHELPER_H + +#include "AMDGPURBLegalizeRules.h" +#include "AMDGPURegisterBankInfo.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" + +namespace llvm { +namespace AMDGPU { + +// Receives list of RegBankLLTMapingApplyID and applies register banks on all +// operands. It is user's responsibility to provide RegBankLLTMapingApplyIDs for +// all register operands, there is no need to specify NonReg for trailing imm +// operands. This finishes selection of register banks if there is no need to +// replace instruction. In other case InstApplyMethod will create new +// instruction(s). 
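+//
+// Typical use (sketch): construct once per function, apply per instruction:
+//   RegBankLegalizeHelper Helper(B, MRI, MUI, RBI, RBLRules);
+//   Helper.findRuleAndApplyMapping(MI);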
+class RegBankLegalizeHelper { + MachineIRBuilder &B; + MachineRegisterInfo &MRI; + const MachineUniformityInfo &MUI; + const RegisterBankInfo &RBI; + const RegBankLegalizeRules &RBLRules; + const RegisterBank *SgprRB; + const RegisterBank *VgprRB; + const RegisterBank *VccRB; + + LLT S1 = LLT::scalar(1); + LLT S16 = LLT::scalar(16); + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); + LLT V2S16 = LLT::fixed_vector(2, 16); + LLT V2S32 = LLT::fixed_vector(2, 32); + LLT V3S32 = LLT::fixed_vector(3, 32); + LLT V4S32 = LLT::fixed_vector(4, 32); + LLT V6S32 = LLT::fixed_vector(6, 32); + LLT V7S32 = LLT::fixed_vector(7, 32); + LLT V8S32 = LLT::fixed_vector(8, 32); + + LLT V3S64 = LLT::fixed_vector(3, 64); + LLT V4S64 = LLT::fixed_vector(4, 64); + LLT V16S64 = LLT::fixed_vector(16, 64); + + LLT P1 = LLT::pointer(1, 64); + LLT P4 = LLT::pointer(4, 64); + LLT P6 = LLT::pointer(6, 32); + +public: + RegBankLegalizeHelper(MachineIRBuilder &B, MachineRegisterInfo &MRI, + const MachineUniformityInfo &MUI, + const RegisterBankInfo &RBI, + const RegBankLegalizeRules &RBLRules) + : B(B), MRI(MRI), MUI(MUI), RBI(RBI), RBLRules(RBLRules), + SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), + VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), + VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)){}; + + bool findRuleAndApplyMapping(MachineInstr &MI); + + // Manual apply helpers. + void applyMappingPHI(MachineInstr &MI); + void applyMappingTrivial(MachineInstr &MI); + +private: + Register createVgpr(LLT Ty) { + return MRI.createVirtualRegister({VgprRB, Ty}); + } + Register createSgpr(LLT Ty) { + return MRI.createVirtualRegister({SgprRB, Ty}); + } + Register createVcc() { return MRI.createVirtualRegister({VccRB, S1}); } + + const RegisterBank *getRegBank(Register Reg) { + const RegisterBank *RB = MRI.getRegBankOrNull(Reg); + // This assert is not guaranteed by default. RB-select ensures that all + // instructions that we want to RB-legalize have reg banks on all registers. + // There might be a few exceptions. Workaround for them is to not write + // 'mapping' for register operand that is expected to have reg class. + assert(RB); + return RB; + } + + bool executeInWaterfallLoop(MachineIRBuilder &B, + iterator_range Range, + SmallSet &SGPROperandRegs); + + LLT getTyFromID(RegBankLLTMapingApplyID ID); + LLT getBTyFromID(RegBankLLTMapingApplyID ID, LLT Ty); + + const RegisterBank *getRBFromID(RegBankLLTMapingApplyID ID); + + void + applyMappingDst(MachineInstr &MI, unsigned &OpIdx, + const SmallVectorImpl &MethodIDs); + + void + applyMappingSrc(MachineInstr &MI, unsigned &OpIdx, + const SmallVectorImpl &MethodIDs, + SmallSet &SGPRWaterfallOperandRegs); + + void splitLoad(MachineInstr &MI, ArrayRef LLTBreakdown, + LLT MergeTy = LLT()); + void widenLoad(MachineInstr &MI, LLT WideTy, LLT MergeTy = LLT()); + + void lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, + SmallSet &SGPRWaterfallOperandRegs); +}; + +} // end namespace AMDGPU +} // end namespace llvm + +#endif diff --git a/llvm/lib/Target/AMDGPU/AMDGPURBLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURBLegalizeRules.cpp new file mode 100644 index 0000000000000..895a596cf84f4 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPURBLegalizeRules.cpp @@ -0,0 +1,627 @@ +//===-- AMDGPURBLegalizeRules.cpp -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// Definitions of RBLegalize Rules for all opcodes. +/// Implementation of container for all the Rules and search. +/// Fast search for most common case when Rule.Predicate checks LLT and +/// uniformity of register in operand 0. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPURBLegalizeRules.h" +#include "AMDGPUInstrInfo.h" +#include "GCNSubtarget.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/Support/AMDGPUAddrSpace.h" + +using namespace llvm; +using namespace AMDGPU; + +RegBankLLTMapping::RegBankLLTMapping( + std::initializer_list DstOpMappingList, + std::initializer_list SrcOpMappingList, + LoweringMethodID LoweringMethod) + : DstOpMapping(DstOpMappingList), SrcOpMapping(SrcOpMappingList), + LoweringMethod(LoweringMethod) {} + +PredicateMapping::PredicateMapping( + std::initializer_list OpList, + std::function TestFunc) + : OpUniformityAndTypes(OpList), TestFunc(TestFunc) {} + +bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, + const MachineUniformityInfo &MUI, + const MachineRegisterInfo &MRI) { + switch (UniID) { + case S1: + return MRI.getType(Reg) == LLT::scalar(1); + case S16: + return MRI.getType(Reg) == LLT::scalar(16); + case S32: + return MRI.getType(Reg) == LLT::scalar(32); + case S64: + return MRI.getType(Reg) == LLT::scalar(64); + case P1: + return MRI.getType(Reg) == LLT::pointer(1, 64); + case P3: + return MRI.getType(Reg) == LLT::pointer(3, 32); + case P4: + return MRI.getType(Reg) == LLT::pointer(4, 64); + case P5: + return MRI.getType(Reg) == LLT::pointer(5, 32); + case B32: + return MRI.getType(Reg).getSizeInBits() == 32; + case B64: + return MRI.getType(Reg).getSizeInBits() == 64; + case B96: + return MRI.getType(Reg).getSizeInBits() == 96; + case B128: + return MRI.getType(Reg).getSizeInBits() == 128; + case B256: + return MRI.getType(Reg).getSizeInBits() == 256; + case B512: + return MRI.getType(Reg).getSizeInBits() == 512; + + case UniS1: + return MRI.getType(Reg) == LLT::scalar(1) && MUI.isUniform(Reg); + case UniS16: + return MRI.getType(Reg) == LLT::scalar(16) && MUI.isUniform(Reg); + case UniS32: + return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniform(Reg); + case UniS64: + return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniform(Reg); + case UniP1: + return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isUniform(Reg); + case UniP3: + return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isUniform(Reg); + case UniP4: + return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg); + case UniP5: + return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg); + case UniB32: + return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg); + case UniB64: + return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isUniform(Reg); + case UniB96: + return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isUniform(Reg); + case UniB128: + return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isUniform(Reg); + case UniB256: + return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isUniform(Reg); + case UniB512: + return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isUniform(Reg); + + case DivS1: + return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergent(Reg); + case DivS32: + return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergent(Reg); 
+ case DivS64: + return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergent(Reg); + case DivP1: + return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergent(Reg); + case DivP3: + return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isDivergent(Reg); + case DivP4: + return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isDivergent(Reg); + case DivP5: + return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isDivergent(Reg); + case DivB32: + return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg); + case DivB64: + return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isDivergent(Reg); + case DivB96: + return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isDivergent(Reg); + case DivB128: + return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isDivergent(Reg); + case DivB256: + return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isDivergent(Reg); + case DivB512: + return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isDivergent(Reg); + + case _: + return true; + default: + llvm_unreachable("missing matchUniformityAndLLT\n"); + } +} + +bool PredicateMapping::match(const MachineInstr &MI, + const MachineUniformityInfo &MUI, + const MachineRegisterInfo &MRI) const { + // Check LLT signature. + for (unsigned i = 0; i < OpUniformityAndTypes.size(); ++i) { + if (OpUniformityAndTypes[i] == _) { + if (MI.getOperand(i).isReg() && + MI.getOperand(i).getReg() != AMDGPU::NoRegister) + return false; + continue; + } + + // Remaining IDs check registers. + if (!MI.getOperand(i).isReg()) + return false; + + if (!matchUniformityAndLLT(MI.getOperand(i).getReg(), + OpUniformityAndTypes[i], MUI, MRI)) + return false; + } + + // More complex check. + if (TestFunc) + return TestFunc(MI); + + return true; +} + +SetOfRulesForOpcode::SetOfRulesForOpcode() {} + +SetOfRulesForOpcode::SetOfRulesForOpcode(FastRulesTypes FastTypes) + : FastTypes(FastTypes) {} + +UniformityLLTOpPredicateID LLTToId(LLT Ty) { + if (Ty == LLT::scalar(16)) + return S16; + if (Ty == LLT::scalar(32)) + return S32; + if (Ty == LLT::scalar(64)) + return S64; + if (Ty == LLT::fixed_vector(2, 16)) + return V2S16; + if (Ty == LLT::fixed_vector(2, 32)) + return V2S32; + if (Ty == LLT::fixed_vector(3, 32)) + return V3S32; + if (Ty == LLT::fixed_vector(4, 32)) + return V4S32; + return _; +} + +UniformityLLTOpPredicateID LLTToBId(LLT Ty) { + if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) || + Ty == LLT::pointer(3, 32) || Ty == LLT::pointer(5, 32) || + Ty == LLT::pointer(6, 32)) + return B32; + if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) || + Ty == LLT::fixed_vector(4, 16) || Ty == LLT::pointer(1, 64) || + Ty == LLT::pointer(4, 64)) + return B64; + if (Ty == LLT::fixed_vector(3, 32)) + return B96; + if (Ty == LLT::fixed_vector(4, 32)) + return B128; + return _; +} + +const RegBankLLTMapping & +SetOfRulesForOpcode::findMappingForMI(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + const MachineUniformityInfo &MUI) const { + // Search in "Fast Rules". + // Note: if fast rules are enabled, RegBankLLTMapping must be added in each + // slot that could "match fast Predicate". If not, Invalid Mapping is + // returned which results in failure, does not search "Slow Rules". 
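+  // e.g. with Standard fast rules, a uniform S32 operand 0 selects Uni[0] and
+  // a divergent S32 operand 0 selects Div[0]; see getFastPredicateSlot.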
+ if (FastTypes != No) { + Register Reg = MI.getOperand(0).getReg(); + int Slot; + if (FastTypes == StandardB) + Slot = getFastPredicateSlot(LLTToBId(MRI.getType(Reg))); + else + Slot = getFastPredicateSlot(LLTToId(MRI.getType(Reg))); + + if (Slot != -1) { + if (MUI.isUniform(Reg)) + return Uni[Slot]; + else + return Div[Slot]; + } + } + + // Slow search for more complex rules. + for (const RBSRule &Rule : Rules) { + if (Rule.Predicate.match(MI, MUI, MRI)) + return Rule.OperandMapping; + } + MI.dump(); + llvm_unreachable("no rules found for MI"); +} + +void SetOfRulesForOpcode::addRule(RBSRule Rule) { Rules.push_back(Rule); } + +void SetOfRulesForOpcode::addFastRuleDivergent(UniformityLLTOpPredicateID Ty, + RegBankLLTMapping RuleApplyIDs) { + int Slot = getFastPredicateSlot(Ty); + assert(Slot != -1 && "Ty unsupported in this FastRulesTypes"); + Div[Slot] = RuleApplyIDs; +} + +void SetOfRulesForOpcode::addFastRuleUniform(UniformityLLTOpPredicateID Ty, + RegBankLLTMapping RuleApplyIDs) { + int Slot = getFastPredicateSlot(Ty); + assert(Slot != -1 && "Ty unsupported in this FastRulesTypes"); + Uni[Slot] = RuleApplyIDs; +} + +int SetOfRulesForOpcode::getFastPredicateSlot( + UniformityLLTOpPredicateID Ty) const { + switch (FastTypes) { + case Standard: + switch (Ty) { + case S32: + return 0; + case S16: + return 1; + case S64: + return 2; + case V2S16: + return 3; + default: + return -1; + } + case StandardB: + switch (Ty) { + case B32: + return 0; + case B64: + return 1; + case B96: + return 2; + case B128: + return 3; + default: + return -1; + } + case Vector: + switch (Ty) { + case S32: + return 0; + case V2S32: + return 1; + case V3S32: + return 2; + case V4S32: + return 3; + default: + return -1; + } + default: + return -1; + } +} + +RegBankLegalizeRules::RuleSetInitializer +RegBankLegalizeRules::addRulesForGOpcs(std::initializer_list OpcList, + FastRulesTypes FastTypes) { + return RuleSetInitializer(OpcList, GRulesAlias, GRules, FastTypes); +} + +RegBankLegalizeRules::RuleSetInitializer +RegBankLegalizeRules::addRulesForIOpcs(std::initializer_list OpcList, + FastRulesTypes FastTypes) { + return RuleSetInitializer(OpcList, IRulesAlias, IRules, FastTypes); +} + +const SetOfRulesForOpcode & +RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const { + unsigned Opc = MI.getOpcode(); + if (Opc == AMDGPU::G_INTRINSIC || Opc == AMDGPU::G_INTRINSIC_CONVERGENT || + Opc == AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS || + Opc == AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS) { + unsigned IntrID = cast(MI).getIntrinsicID(); + // assert(IRules.contains(IntrID)); + if (!IRulesAlias.contains(IntrID)) { + MI.dump(); + llvm_unreachable("no rules for opc"); + } + return IRules.at(IRulesAlias.at(IntrID)); + } + // assert(GRules.contains(Opc)); + if (!GRulesAlias.contains(Opc)) { + MI.dump(); + llvm_unreachable("no rules for opc"); + } + return GRules.at(GRulesAlias.at(Opc)); +} + +// Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and '!'. +class Predicate { +public: + struct Elt { + // Save formula composed of Pred, '&&', '||' and '!' as a jump table. + // Sink ! to Pred. For example !((A && !B) || C) -> (!A || B) && !C + // Sequences of && and || will be represented by jumps, for example: + // (A && B && ... X) or (A && B && ... X) || Y + // A == true jump to B + // A == false jump to end or Y, result is A(false) or Y + // (A || B || ... X) or (A || B || ... 
+    //   A == true jump to end or Y, result is A(true) or Y
+    //   A == false jump to B
+    // Notice that when negating an expression, we simply flip Neg on each
+    // Pred and swap TJumpOffset and FJumpOffset (&& becomes ||, || becomes &&).
+    std::function<bool(const MachineInstr &)> Pred;
+    bool Neg; // Neg of Pred is calculated before jump
+    unsigned TJumpOffset;
+    unsigned FJumpOffset;
+  };
+
+  SmallVector<Elt> Expression;
+
+  Predicate(std::function<bool(const MachineInstr &)> Pred) {
+    Expression.push_back({Pred, false, 1, 1});
+  };
+
+  Predicate(SmallVectorImpl<Elt> &Expr) { Expression.swap(Expr); };
+
+  bool operator()(const MachineInstr &MI) const {
+    unsigned Idx = 0;
+    unsigned ResultIdx = Expression.size();
+    bool Result;
+    do {
+      Result = Expression[Idx].Pred(MI);
+      Result = Expression[Idx].Neg ? !Result : Result;
+      if (Result) {
+        Idx += Expression[Idx].TJumpOffset;
+      } else {
+        Idx += Expression[Idx].FJumpOffset;
+      }
+    } while (Idx != ResultIdx);
+
+    return Result;
+  };
+
+  Predicate operator!() {
+    SmallVector<Elt> NegExpression;
+    for (unsigned i = 0; i < Expression.size(); ++i) {
+      NegExpression.push_back({Expression[i].Pred, !Expression[i].Neg,
+                               Expression[i].FJumpOffset,
+                               Expression[i].TJumpOffset});
+    }
+    return Predicate(NegExpression);
+  };
+
+  Predicate operator&&(Predicate &RHS) {
+    SmallVector<Elt> AndExpression = Expression;
+
+    unsigned RHSSize = RHS.Expression.size();
+    unsigned ResultIdx = Expression.size();
+    for (unsigned i = 0; i < ResultIdx; ++i) {
+      // LHS results in false, whole expression results in false.
+      if (i + AndExpression[i].FJumpOffset == ResultIdx)
+        AndExpression[i].FJumpOffset += RHSSize;
+    }
+
+    AndExpression.append(RHS.Expression);
+
+    return Predicate(AndExpression);
+  }
+
+  Predicate operator&&(Predicate &&RHS) {
+    SmallVector<Elt> AndExpression = Expression;
+
+    unsigned RHSSize = RHS.Expression.size();
+    unsigned ResultIdx = Expression.size();
+    for (unsigned i = 0; i < ResultIdx; ++i) {
+      // LHS results in false, whole expression results in false.
+      if (i + AndExpression[i].FJumpOffset == ResultIdx)
+        AndExpression[i].FJumpOffset += RHSSize;
+    }
+
+    AndExpression.append(RHS.Expression);
+
+    return Predicate(AndExpression);
+  }
+
+  Predicate operator||(Predicate &RHS) {
+    SmallVector<Elt> OrExpression = Expression;
+
+    unsigned RHSSize = RHS.Expression.size();
+    unsigned ResultIdx = Expression.size();
+    for (unsigned i = 0; i < ResultIdx; ++i) {
+      // LHS results in true, whole expression results in true.
+      if (i + OrExpression[i].TJumpOffset == ResultIdx)
+        OrExpression[i].TJumpOffset += RHSSize;
+    }
+
+    OrExpression.append(RHS.Expression);
+
+    return Predicate(OrExpression);
+  }
+
+  Predicate operator||(Predicate &&RHS) {
+    SmallVector<Elt> OrExpression = Expression;
+
+    unsigned RHSSize = RHS.Expression.size();
+    unsigned ResultIdx = Expression.size();
+    for (unsigned i = 0; i < ResultIdx; ++i) {
+      // LHS results in true, whole expression results in true.
+      if (i + OrExpression[i].TJumpOffset == ResultIdx)
+        OrExpression[i].TJumpOffset += RHSSize;
+    }
+
+    OrExpression.append(RHS.Expression);
+
+    return Predicate(OrExpression);
+  }
+};
+
+// Initialize rules
+RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
+                                           MachineRegisterInfo &_MRI)
+    : ST(&_ST), MRI(&_MRI) {
+
+  addRulesForGOpcs({G_ADD}, Standard)
+      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
+      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
+
+  addRulesForGOpcs({G_MUL}, Standard).Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
+
+  addRulesForGOpcs({G_XOR, G_OR, G_AND}, StandardB)
+      .Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}})
+      .Any({{DivS1}, {{Vcc}, {Vcc, Vcc}}})
+      .Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32});
+
+  addRulesForGOpcs({G_SHL}, Standard)
+      .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
+      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
+
+  // Note: we only write S1 rules for G_IMPLICIT_DEF, G_CONSTANT, G_FCONSTANT
+  // and G_FREEZE here; the rest is trivially regbankselected earlier.
+  addRulesForGOpcs({G_CONSTANT})
+      .Any({{UniS1, _}, {{Sgpr32Trunc}, {None}, UniCstExt}});
+
+  addRulesForGOpcs({G_ICMP})
+      .Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
+      .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}});
+
+  addRulesForGOpcs({G_FCMP})
+      .Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}})
+      .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}});
+
+  addRulesForGOpcs({G_BRCOND})
+      .Any({{UniS1}, {{}, {Sgpr32AExtBoolInReg}}})
+      .Any({{DivS1}, {{}, {Vcc}}});
+
+  addRulesForGOpcs({G_BR}).Any({{_}, {{}, {None}}});
+
+  addRulesForGOpcs({G_SELECT}, StandardB)
+      .Div(B32, {{VgprB32}, {Vcc, VgprB32, VgprB32}})
+      .Uni(B32, {{SgprB32}, {Sgpr32AExtBoolInReg, SgprB32, SgprB32}});
+
+  addRulesForGOpcs({G_ANYEXT}).Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}});
+
+  // In global-isel, G_TRUNC in-reg is treated as a no-op and selected into a
+  // COPY. It is up to the user to deal with the truncated bits.
+  addRulesForGOpcs({G_TRUNC})
+      .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}})
+      // This is non-trivial. VgprToVccCopy is done using a compare instruction.
+      .Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}});
+
+  addRulesForGOpcs({G_ZEXT, G_SEXT})
+      .Any({{UniS32, S1}, {{Sgpr32}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
+      .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
+      .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}});
+
+  bool hasUnAlignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12;
+  bool hasSMRDSmall = ST->hasScalarSubwordLoads();
+
+  Predicate isAlign16([](const MachineInstr &MI) -> bool {
+    return (*MI.memoperands_begin())->getAlign() >= Align(16);
+  });
+
+  Predicate isAlign4([](const MachineInstr &MI) -> bool {
+    return (*MI.memoperands_begin())->getAlign() >= Align(4);
+  });
+
+  Predicate isAtomicMMO([](const MachineInstr &MI) -> bool {
+    return (*MI.memoperands_begin())->isAtomic();
+  });
+
+  Predicate isUniMMO([](const MachineInstr &MI) -> bool {
+    return AMDGPUInstrInfo::isUniformMMO(*MI.memoperands_begin());
+  });
+
+  Predicate isConst([](const MachineInstr &MI) -> bool {
+    // This is weird. Can the AS in the MMO be different than the AS on the
+    // pointer? Leaving this as is for the purpose of not changing existing
+    // mir tests.
+    const MachineMemOperand *MMO = *MI.memoperands_begin();
+    const unsigned AS = MMO->getAddrSpace();
+    return AS == AMDGPUAS::CONSTANT_ADDRESS ||
+           AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
+  });
+
+  Predicate isVolatileMMO([](const MachineInstr &MI) -> bool {
+    return (*MI.memoperands_begin())->isVolatile();
+  });
+
+  Predicate isInvMMO([](const MachineInstr &MI) -> bool {
+    return (*MI.memoperands_begin())->isInvariant();
+  });
+
+  Predicate isNoClobberMMO([](const MachineInstr &MI) -> bool {
+    return (*MI.memoperands_begin())->getFlags() & MONoClobber;
+  });
+
+  Predicate isNaturalAlignedSmall([](const MachineInstr &MI) -> bool {
+    const MachineMemOperand *MMO = *MI.memoperands_begin();
+    const unsigned MemSize = 8 * MMO->getSize().getValue();
+    return (MemSize == 16 && MMO->getAlign() >= Align(2)) ||
+           (MemSize == 8 && MMO->getAlign() >= Align(1));
+  });
+
+  auto isUL = !isAtomicMMO && isUniMMO && (isConst || !isVolatileMMO) &&
+              (isConst || isInvMMO || isNoClobberMMO);
+
+  // clang-format off
+  addRulesForGOpcs({G_LOAD})
+      .Any({{DivB32, DivP1}, {{VgprB32}, {VgprP1}}})
+      .Any({{{UniB256, UniP1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
+      .Any({{{UniB512, UniP1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
+      .Any({{{UniB256, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP1}, SplitLoad}})
+      .Any({{{UniB512, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP1}, SplitLoad}})
+
+      .Any({{DivB32, UniP3}, {{VgprB32}, {VgprP3}}})
+      .Any({{{UniB32, UniP3}, isAlign4 && isUL}, {{SgprB32}, {SgprP3}}})
+      .Any({{{UniB32, UniP3}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP3}}})
+
+      .Any({{{DivB256, DivP4}}, {{VgprB256}, {VgprP4}, SplitLoad}})
+      .Any({{{UniB32, UniP4}, isNaturalAlignedSmall && isUL}, {{SgprB32}, {SgprP4}}}, hasSMRDSmall) // i8 and i16 load
+      .Any({{{UniB32, UniP4}, isAlign4 && isUL}, {{SgprB32}, {SgprP4}}})
+      .Any({{{UniB96, UniP4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasUnAlignedLoads)
+      .Any({{{UniB96, UniP4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasUnAlignedLoads)
+      .Any({{{UniB96, UniP4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasUnAlignedLoads)
+      .Any({{{UniB256, UniP4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}})
+      .Any({{{UniB512, UniP4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}})
+      .Any({{{UniB32, UniP4}, !isNaturalAlignedSmall || !isUL}, {{UniInVgprB32}, {VgprP4}}}, hasSMRDSmall) // i8 and i16 load
+      .Any({{{UniB32, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP4}}})
+      .Any({{{UniB256, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP4}, SplitLoad}})
+      .Any({{{UniB512, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP4}, SplitLoad}})
+
+      .Any({{DivB32, P5}, {{VgprB32}, {VgprP5}}});
+
+  addRulesForGOpcs({G_ZEXTLOAD}) // i8 and i16 zero-extending loads
+      .Any({{{UniB32, UniP3}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP3}}})
+      .Any({{{UniB32, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP4}}});
+  // clang-format on
+
+  addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, Vector)
+      .Div(V4S32, {{VgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
+      .Uni(V4S32, {{UniInVgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}});
+
+  addRulesForGOpcs({G_STORE})
+      .Any({{S32, P1}, {{}, {Vgpr32, VgprP1}}})
+      .Any({{S64, P1}, {{}, {Vgpr64, VgprP1}}})
+      .Any({{V4S32, P1}, {{}, {VgprV4S32, VgprP1}}});
+
+  addRulesForGOpcs({G_PTR_ADD}).Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}});
+
+  addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}});
+
+  bool hasSALUFloat = ST->hasSALUFloatInsts();
+
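+  // For example, the G_FADD rules below read as follows: with SALU float
+  // instructions a uniform S32 G_FADD stays on SGPRs; without them it is
+  // computed on VGPRs and the result is read back to an SGPR (UniInVgprS32).
+  // Divergent S32 G_FADD always uses VGPRs.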
+  addRulesForGOpcs({G_FADD}, Standard)
+      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
+      .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat)
+      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
+
+  addRulesForGOpcs({G_FPTOUI})
+      .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
+      .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat);
+
+  addRulesForGOpcs({G_UITOFP})
+      .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
+      .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat);
+
+  using namespace Intrinsic;
+
+  // This is an "intrinsic lane mask"; it was set to i32/i64 in LLVM IR.
+  addRulesForIOpcs({amdgcn_end_cf}).Any({{_, S32}, {{}, {None, Sgpr32}}});
+
+  addRulesForIOpcs({amdgcn_if_break}, Standard)
+      .Uni(S32, {{Sgpr32}, {IntrId, Vcc, Sgpr32}});
+
+} // end initialize rules
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURBLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURBLegalizeRules.h
new file mode 100644
index 0000000000000..1aa88f0da137c
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPURBLegalizeRules.h
@@ -0,0 +1,319 @@
+//===- AMDGPURBLegalizeRules -------------------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURBLEGALIZERULES_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPURBLEGALIZERULES_H
+
+#include "llvm/CodeGen/MachineUniformityAnalysis.h"
+
+namespace llvm {
+
+class GCNSubtarget;
+
+namespace AMDGPU {
+
+// IDs used to build the predicate for an RBSRule. A Predicate can have one or
+// more IDs, and each represents a check for 'uniform or divergent' + LLT, or
+// just LLT, on a register operand.
+// Most often checking one operand is enough to decide which RegBankLLTMapping
+// to apply (see Fast Rules); IDs are useful when two or more operands need to
+// be checked.
+enum UniformityLLTOpPredicateID {
+  _,
+  // scalars
+  S1,
+  S16,
+  S32,
+  S64,
+
+  UniS1,
+  UniS16,
+  UniS32,
+  UniS64,
+
+  DivS1,
+  DivS32,
+  DivS64,
+
+  // pointers
+  P1,
+  P3,
+  P4,
+  P5,
+
+  UniP1,
+  UniP3,
+  UniP4,
+  UniP5,
+
+  DivP1,
+  DivP3,
+  DivP4,
+  DivP5,
+
+  // vectors
+  V2S16,
+  V2S32,
+  V3S32,
+  V4S32,
+
+  // B types
+  B32,
+  B64,
+  B96,
+  B128,
+  B256,
+  B512,
+
+  UniB32,
+  UniB64,
+  UniB96,
+  UniB128,
+  UniB256,
+  UniB512,
+
+  DivB32,
+  DivB64,
+  DivB96,
+  DivB128,
+  DivB256,
+  DivB512,
+};
+
+// How to apply a register bank on a register operand.
+// In most cases, this serves as an LLT and register bank assert.
+// Can change operands and insert copies, extends, truncs, and readfirstlanes.
+// Anything more complicated requires a LoweringMethod.
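+// For example (as used by the rules in AMDGPURBLegalizeRules.cpp): Sgpr32AExt
+// on a source operand any-extends a narrow scalar to s32 on the SGPR bank,
+// while Sgpr32Trunc on a def produces an s32 SGPR def whose value is then
+// truncated back to the original narrow type.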
+enum RegBankLLTMapingApplyID {
+  Invalid,
+  None,
+  IntrId,
+  Imm,
+  Vcc,
+
+  // sgpr scalars, pointers, vectors and B-types
+  Sgpr16,
+  Sgpr32,
+  Sgpr64,
+  SgprP1,
+  SgprP3,
+  SgprP4,
+  SgprP5,
+  SgprV4S32,
+  SgprB32,
+  SgprB64,
+  SgprB96,
+  SgprB128,
+  SgprB256,
+  SgprB512,
+
+  // vgpr scalars, pointers, vectors and B-types
+  Vgpr32,
+  Vgpr64,
+  VgprP1,
+  VgprP3,
+  VgprP4,
+  VgprP5,
+  VgprB32,
+  VgprB64,
+  VgprB96,
+  VgprB128,
+  VgprB256,
+  VgprB512,
+  VgprV4S32,
+
+  // Dst only modifiers: read-any-lane and truncs
+  UniInVcc,
+  UniInVgprS32,
+  UniInVgprV4S32,
+  UniInVgprB32,
+  UniInVgprB64,
+  UniInVgprB96,
+  UniInVgprB128,
+  UniInVgprB256,
+  UniInVgprB512,
+
+  Sgpr32Trunc,
+
+  // Src only modifiers: waterfalls, extends
+  Sgpr32AExt,
+  Sgpr32AExtBoolInReg,
+  Sgpr32SExt,
+};
+
+// The instruction needs to be replaced with a sequence of instructions. The
+// lowering was not done by the legalizer since the instruction is available on
+// either SGPR or VGPR. For example, S64 AND is available on SGPR, and for that
+// reason S64 AND is legal in the context of the Legalizer, which only checks
+// LLTs. But S64 AND is not available on VGPR. Lower it to two S32 VGPR ANDs.
+enum LoweringMethodID {
+  DoNotLower,
+  UniExtToSel,
+  VgprToVccCopy,
+  SplitTo32,
+  Ext32To64,
+  UniCstExt,
+  SplitLoad,
+  WidenLoad,
+};
+
+enum FastRulesTypes {
+  No,
+  Standard,  // S16, S32, S64, V2S16
+  StandardB, // B32, B64, B96, B128
+  Vector,    // S32, V2S32, V3S32, V4S32
+};
+
+struct RegBankLLTMapping {
+  SmallVector<RegBankLLTMapingApplyID> DstOpMapping;
+  SmallVector<RegBankLLTMapingApplyID> SrcOpMapping;
+  LoweringMethodID LoweringMethod;
+  RegBankLLTMapping(
+      std::initializer_list<RegBankLLTMapingApplyID> DstOpMappingList,
+      std::initializer_list<RegBankLLTMapingApplyID> SrcOpMappingList,
+      LoweringMethodID LoweringMethod = DoNotLower);
+};
+
+struct PredicateMapping {
+  SmallVector<UniformityLLTOpPredicateID> OpUniformityAndTypes;
+  std::function<bool(const MachineInstr &)> TestFunc;
+  PredicateMapping(
+      std::initializer_list<UniformityLLTOpPredicateID> OpList,
+      std::function<bool(const MachineInstr &)> TestFunc = nullptr);
+
+  bool match(const MachineInstr &MI, const MachineUniformityInfo &MUI,
+             const MachineRegisterInfo &MRI) const;
+};
+
+struct RBSRule {
+  PredicateMapping Predicate;
+  RegBankLLTMapping OperandMapping;
+};
+
+class SetOfRulesForOpcode {
+  // "Slow Rules". More complex 'Rules[i].Predicate', check them one by one.
+  SmallVector<RBSRule> Rules;
+
+  // "Fast Rules"
+  // Instead of testing each 'Rules[i].Predicate' we do direct access to
+  // RegBankLLTMapping using getFastPredicateSlot. For example if:
+  // - FastTypes == Standard, Uni[0] holds the Mapping in case Op 0 is a
+  //   uniform S32
+  // - FastTypes == Vector, Div[3] holds the Mapping in case Op 0 is a
+  //   divergent V4S32
+  FastRulesTypes FastTypes = No;
+#define InvMapping RegBankLLTMapping({Invalid}, {Invalid})
+  RegBankLLTMapping Uni[4] = {InvMapping, InvMapping, InvMapping, InvMapping};
+  RegBankLLTMapping Div[4] = {InvMapping, InvMapping, InvMapping, InvMapping};
+
+public:
+  SetOfRulesForOpcode();
+  SetOfRulesForOpcode(FastRulesTypes FastTypes);
+
+  const RegBankLLTMapping &
+  findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI,
+                   const MachineUniformityInfo &MUI) const;
+
+  void addRule(RBSRule Rule);
+
+  void addFastRuleDivergent(UniformityLLTOpPredicateID Ty,
+                            RegBankLLTMapping RuleApplyIDs);
+  void addFastRuleUniform(UniformityLLTOpPredicateID Ty,
+                          RegBankLLTMapping RuleApplyIDs);
+
+private:
+  int getFastPredicateSlot(UniformityLLTOpPredicateID Ty) const;
+};
+
+// Essentially a 'map<Opcode, SetOfRulesForOpcode>' but a little more
+// efficient.
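+// Typical use, taken from AMDGPURBLegalizeRules.cpp above:
+//   addRulesForGOpcs({G_SHL}, Standard)
+//       .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
+//       .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
+// This aliases every opcode in the list to one key opcode and fills the fast
+// uniform/divergent S64 slots for it.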
+class RegBankLegalizeRules {
+  const GCNSubtarget *ST;
+  MachineRegisterInfo *MRI;
+  // Separate maps for G-opcodes and intrinsics since they are in different
+  // enums. Multiple opcodes can share the same set of rules.
+  // RulesAlias = map<all opcodes in OpcList, KeyOpcode>
+  // Rules = map<KeyOpcode, SetOfRulesForOpcode>
+  SmallDenseMap<unsigned, unsigned> GRulesAlias;
+  SmallDenseMap<unsigned, SetOfRulesForOpcode> GRules;
+  SmallDenseMap<unsigned, unsigned> IRulesAlias;
+  SmallDenseMap<unsigned, SetOfRulesForOpcode> IRules;
+  class RuleSetInitializer {
+    SetOfRulesForOpcode *RuleSet;
+
+  public:
+    // Used for clang-format line breaks and to force writing all rules for an
+    // opcode in the same place.
+    template <typename AliasMap, typename RulesMap>
+    RuleSetInitializer(std::initializer_list<unsigned> OpcList,
+                       AliasMap &RulesAlias, RulesMap &Rules,
+                       FastRulesTypes FastTypes = No) {
+      unsigned KeyOpcode = *OpcList.begin();
+      for (unsigned Opc : OpcList) {
+        auto [_, NewInput] = RulesAlias.try_emplace(Opc, KeyOpcode);
+        assert(NewInput && "Can't redefine existing Rules");
+      }
+
+      auto [DenseMapIter, NewInput] = Rules.try_emplace(KeyOpcode, FastTypes);
+      assert(NewInput && "Can't redefine existing Rules");
+
+      RuleSet = &DenseMapIter->second;
+    }
+
+    RuleSetInitializer(const RuleSetInitializer &) = delete;
+    RuleSetInitializer &operator=(const RuleSetInitializer &) = delete;
+    RuleSetInitializer(RuleSetInitializer &&) = delete;
+    RuleSetInitializer &operator=(RuleSetInitializer &&) = delete;
+    ~RuleSetInitializer() = default;
+
+    RuleSetInitializer &Div(UniformityLLTOpPredicateID Ty,
+                            RegBankLLTMapping RuleApplyIDs,
+                            bool STPred = true) {
+      if (STPred)
+        RuleSet->addFastRuleDivergent(Ty, RuleApplyIDs);
+      return *this;
+    }
+
+    RuleSetInitializer &Uni(UniformityLLTOpPredicateID Ty,
+                            RegBankLLTMapping RuleApplyIDs,
+                            bool STPred = true) {
+      if (STPred)
+        RuleSet->addFastRuleUniform(Ty, RuleApplyIDs);
+      return *this;
+    }
+
+    RuleSetInitializer &Any(RBSRule Init, bool STPred = true) {
+      if (STPred)
+        RuleSet->addRule(Init);
+      return *this;
+    }
+  };
+
+  RuleSetInitializer addRulesForGOpcs(std::initializer_list<unsigned> OpcList,
+                                      FastRulesTypes FastTypes = No);
+
+  RuleSetInitializer addRulesForIOpcs(std::initializer_list<unsigned> OpcList,
+                                      FastRulesTypes FastTypes = No);
+
+public:
+  // Initialize rules for all opcodes.
+  RegBankLegalizeRules(const GCNSubtarget &ST, MachineRegisterInfo &MRI);
+
+  // In case we don't want to regenerate the same rules, we can reuse already
+  // generated rules but need to refresh references to objects that are
+  // created for this run.
+  void refreshRefs(const GCNSubtarget &_ST, MachineRegisterInfo &_MRI) {
+    ST = &_ST;
+    MRI = &_MRI;
+  };
+
+  const SetOfRulesForOpcode &getRulesForOpc(MachineInstr &MI) const;
+};
+
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 019ec108670fa..bf71aeb4ceff6 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -59,6 +59,8 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUGlobalISelDivergenceLowering.cpp
   AMDGPURBSelect.cpp
   AMDGPURBLegalize.cpp
+  AMDGPURBLegalizeRules.cpp
+  AMDGPURBLegalizeHelper.cpp
   AMDGPUGlobalISelUtils.cpp
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInsertDelayAlu.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index faa0b6d6c3f50..01d00b5269639 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -4108,6 +4108,36 @@ def G_SI_CALL : AMDGPUGenericInstruction {
   let isConvergent = 1;
 }
 
+// Uniform in vgpr - a vgpr with the same value in all active lanes.
+ +// $dst = $src0 != 0, selected as: +// $dst(SCC) = s_cmp_lg $src0, 0 +// src0 is either exec or 0 (same value for all active lanes), +// for example result of comparison of two uniform in vgpr. +def G_COPY_SCC_VCC : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src0); + let hasSideEffects = 0; +} + +// $dst = $src0 ? exec : 0, selected as: +// SCC = COPY $src0 +// $dst(SReg_32/64) = s_cselect exec, 0 +def G_COPY_VCC_SCC : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src0); + let hasSideEffects = 0; +} + +// Move uniform in vgpr to sgpr. Selected as v_readfirstlane_b32. +// Semantic difference in READ ANY instead of FIRST(active) LANE allows for +// vgpr to sgpr back-to vgpr combine, vgpr has same value in all active lanes +// vgprDst = COPY (G_READANYLANE vgprSrc) -> vgprDst = sgprSrc +def G_READANYLANE : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0); + let hasSideEffects = 0; +} //============================================================================// // Dummy Instructions diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir index 74af51f0c1676..b9376123e2253 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir @@ -1,7 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s -check-prefixes=GCN,GFX7 -# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s -check-prefixes=GCN,GFX7 -# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=GCN,GFX12 +# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass="rb-select,rb-legalize" %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=GCN,GFX7 +# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass="rb-select,rb-legalize" %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=GCN,GFX12 --- | define amdgpu_kernel void @load_global_v8i32_non_uniform(ptr addrspace(1) %in) { @@ -110,6 +109,7 @@ --- name: load_global_v8i32_non_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -119,11 +119,21 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v8i32, align 32, addrspace 1) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY1]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v8i32, align 32, addrspace 1) ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY1]], [[C]](s64) ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<4 x s32>) from 
%ir.global.not.uniform.v8i32 + 16, basealign 32, addrspace 1) ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) + ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; GCN-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV]] + ; GCN-NEXT: [[READANYLANE1:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV1]] + ; GCN-NEXT: [[READANYLANE2:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV2]] + ; GCN-NEXT: [[READANYLANE3:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV3]] + ; GCN-NEXT: [[READANYLANE4:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV4]] + ; GCN-NEXT: [[READANYLANE5:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV5]] + ; GCN-NEXT: [[READANYLANE6:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV6]] + ; GCN-NEXT: [[READANYLANE7:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV7]] + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[READANYLANE]](s32), [[READANYLANE1]](s32), [[READANYLANE2]](s32), [[READANYLANE3]](s32), [[READANYLANE4]](s32), [[READANYLANE5]](s32), [[READANYLANE6]](s32), [[READANYLANE7]](s32) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>) from %ir.global.not.uniform.v8i32) ... @@ -131,6 +141,7 @@ body: | --- name: load_global_v4i64_non_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -141,11 +152,29 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v4i64, align 32, addrspace 1) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY1]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v4i64, align 32, addrspace 1) ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY1]], [[C]](s64) ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v4i64 + 16, basealign 32, addrspace 1) ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) + ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s64), [[UV1:%[0-9]+]]:vgpr(s64), [[UV2:%[0-9]+]]:vgpr(s64), [[UV3:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>) + ; GCN-NEXT: [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV]](s64) + ; GCN-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV4]] + ; GCN-NEXT: [[READANYLANE1:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV5]] + ; GCN-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE]](s32), [[READANYLANE1]](s32) + ; GCN-NEXT: [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV1]](s64) + ; GCN-NEXT: [[READANYLANE2:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV6]] + ; GCN-NEXT: [[READANYLANE3:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV7]] + ; GCN-NEXT: [[MV1:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE2]](s32), [[READANYLANE3]](s32) + ; GCN-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV2]](s64) + ; GCN-NEXT: [[READANYLANE4:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV8]] + ; GCN-NEXT: 
[[READANYLANE5:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV9]] + ; GCN-NEXT: [[MV2:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE4]](s32), [[READANYLANE5]](s32) + ; GCN-NEXT: [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV3]](s64) + ; GCN-NEXT: [[READANYLANE6:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV10]] + ; GCN-NEXT: [[READANYLANE7:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV11]] + ; GCN-NEXT: [[MV3:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE6]](s32), [[READANYLANE7]](s32) + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(<4 x s64>) = G_LOAD %0 :: (load (<4 x s64>) from %ir.global.not.uniform.v4i64) ... @@ -153,6 +182,7 @@ body: | --- name: load_global_v16i32_non_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -162,17 +192,35 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32, align 64, addrspace 1) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY1]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32, align 64, addrspace 1) ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY1]], [[C]](s64) ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 16, basealign 64, addrspace 1) ; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 - ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY1]], [[C1]](s64) ; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 32, align 32, basealign 64, addrspace 1) ; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 - ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY1]], [[C2]](s64) ; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 48, basealign 64, addrspace 1) ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) + ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) + ; GCN-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV]] + ; GCN-NEXT: [[READANYLANE1:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV1]] + ; GCN-NEXT: [[READANYLANE2:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV2]] + ; GCN-NEXT: [[READANYLANE3:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV3]] + ; GCN-NEXT: [[READANYLANE4:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV4]] + ; GCN-NEXT: [[READANYLANE5:%[0-9]+]]:sgpr(s32) = 
G_READANYLANE [[UV5]] + ; GCN-NEXT: [[READANYLANE6:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV6]] + ; GCN-NEXT: [[READANYLANE7:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV7]] + ; GCN-NEXT: [[READANYLANE8:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV8]] + ; GCN-NEXT: [[READANYLANE9:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV9]] + ; GCN-NEXT: [[READANYLANE10:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV10]] + ; GCN-NEXT: [[READANYLANE11:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV11]] + ; GCN-NEXT: [[READANYLANE12:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV12]] + ; GCN-NEXT: [[READANYLANE13:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV13]] + ; GCN-NEXT: [[READANYLANE14:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV14]] + ; GCN-NEXT: [[READANYLANE15:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV15]] + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<16 x s32>) = G_BUILD_VECTOR [[READANYLANE]](s32), [[READANYLANE1]](s32), [[READANYLANE2]](s32), [[READANYLANE3]](s32), [[READANYLANE4]](s32), [[READANYLANE5]](s32), [[READANYLANE6]](s32), [[READANYLANE7]](s32), [[READANYLANE8]](s32), [[READANYLANE9]](s32), [[READANYLANE10]](s32), [[READANYLANE11]](s32), [[READANYLANE12]](s32), [[READANYLANE13]](s32), [[READANYLANE14]](s32), [[READANYLANE15]](s32) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(<16 x s32>) = G_LOAD %0 :: (load (<16 x s32>) from %ir.global.not.uniform.v16i32) ... @@ -180,6 +228,7 @@ body: | --- name: load_global_v8i64_non_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -189,17 +238,51 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64, align 64, addrspace 1) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY1]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64, align 64, addrspace 1) ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY1]], [[C]](s64) ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 16, basealign 64, addrspace 1) ; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 - ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY1]], [[C1]](s64) ; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD1]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 32, align 32, basealign 64, addrspace 1) ; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 - ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY1]], [[C2]](s64) ; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD2]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 48, basealign 64, addrspace 1) ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>), [[LOAD2]](<2 x s64>), [[LOAD3]](<2 x s64>) + ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s64), [[UV1:%[0-9]+]]:vgpr(s64), [[UV2:%[0-9]+]]:vgpr(s64), [[UV3:%[0-9]+]]:vgpr(s64), [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64), [[UV6:%[0-9]+]]:vgpr(s64), [[UV7:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>) + ; GCN-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = 
G_UNMERGE_VALUES [[UV]](s64) + ; GCN-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV8]] + ; GCN-NEXT: [[READANYLANE1:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV9]] + ; GCN-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE]](s32), [[READANYLANE1]](s32) + ; GCN-NEXT: [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV1]](s64) + ; GCN-NEXT: [[READANYLANE2:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV10]] + ; GCN-NEXT: [[READANYLANE3:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV11]] + ; GCN-NEXT: [[MV1:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE2]](s32), [[READANYLANE3]](s32) + ; GCN-NEXT: [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV2]](s64) + ; GCN-NEXT: [[READANYLANE4:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV12]] + ; GCN-NEXT: [[READANYLANE5:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV13]] + ; GCN-NEXT: [[MV2:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE4]](s32), [[READANYLANE5]](s32) + ; GCN-NEXT: [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV3]](s64) + ; GCN-NEXT: [[READANYLANE6:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV14]] + ; GCN-NEXT: [[READANYLANE7:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV15]] + ; GCN-NEXT: [[MV3:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE6]](s32), [[READANYLANE7]](s32) + ; GCN-NEXT: [[UV16:%[0-9]+]]:vgpr(s32), [[UV17:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV4]](s64) + ; GCN-NEXT: [[READANYLANE8:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV16]] + ; GCN-NEXT: [[READANYLANE9:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV17]] + ; GCN-NEXT: [[MV4:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE8]](s32), [[READANYLANE9]](s32) + ; GCN-NEXT: [[UV18:%[0-9]+]]:vgpr(s32), [[UV19:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV5]](s64) + ; GCN-NEXT: [[READANYLANE10:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV18]] + ; GCN-NEXT: [[READANYLANE11:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV19]] + ; GCN-NEXT: [[MV5:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE10]](s32), [[READANYLANE11]](s32) + ; GCN-NEXT: [[UV20:%[0-9]+]]:vgpr(s32), [[UV21:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV6]](s64) + ; GCN-NEXT: [[READANYLANE12:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV20]] + ; GCN-NEXT: [[READANYLANE13:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV21]] + ; GCN-NEXT: [[MV6:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE12]](s32), [[READANYLANE13]](s32) + ; GCN-NEXT: [[UV22:%[0-9]+]]:vgpr(s32), [[UV23:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV7]](s64) + ; GCN-NEXT: [[READANYLANE14:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV22]] + ; GCN-NEXT: [[READANYLANE15:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV23]] + ; GCN-NEXT: [[MV7:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE14]](s32), [[READANYLANE15]](s32) + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64), [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(<8 x s64>) = G_LOAD %0 :: (load (<8 x s64>) from %ir.global.not.uniform.v8i64) ... 
@@ -207,6 +290,7 @@ body: | --- name: load_global_v8i32_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -223,6 +307,7 @@ body: | --- name: load_global_v4i64_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -239,6 +324,7 @@ body: | --- name: load_global_v16i32_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -255,6 +341,7 @@ body: | --- name: load_global_v8i64_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -271,6 +358,7 @@ body: | --- name: load_constant_v8i32_non_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -280,11 +368,21 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v8i32, align 32, addrspace 4) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY1]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v8i32, align 32, addrspace 4) ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY1]], [[C]](s64) ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v8i32 + 16, basealign 32, addrspace 4) ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) + ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; GCN-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV]] + ; GCN-NEXT: [[READANYLANE1:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV1]] + ; GCN-NEXT: [[READANYLANE2:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV2]] + ; GCN-NEXT: [[READANYLANE3:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV3]] + ; GCN-NEXT: [[READANYLANE4:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV4]] + ; GCN-NEXT: [[READANYLANE5:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV5]] + ; GCN-NEXT: [[READANYLANE6:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV6]] + ; GCN-NEXT: [[READANYLANE7:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV7]] + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[READANYLANE]](s32), [[READANYLANE1]](s32), [[READANYLANE2]](s32), [[READANYLANE3]](s32), [[READANYLANE4]](s32), [[READANYLANE5]](s32), [[READANYLANE6]](s32), [[READANYLANE7]](s32) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>) from %ir.constant.not.uniform.v8i32) ... 
@@ -292,6 +390,7 @@ body: | --- name: load_constant_i256_non_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -301,11 +400,21 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY]](p4) :: (load (s128) from %ir.constant.not.uniform, align 32, addrspace 4) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY1]](p4) :: (load (s128) from %ir.constant.not.uniform, align 32, addrspace 4) ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY1]], [[C]](s64) ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(s128) = G_LOAD [[PTR_ADD]](p4) :: (load (s128) from %ir.constant.not.uniform + 16, basealign 32, addrspace 4) ; GCN-NEXT: [[MV:%[0-9]+]]:vgpr(s256) = G_MERGE_VALUES [[LOAD]](s128), [[LOAD1]](s128) + ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s256) + ; GCN-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV]] + ; GCN-NEXT: [[READANYLANE1:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV1]] + ; GCN-NEXT: [[READANYLANE2:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV2]] + ; GCN-NEXT: [[READANYLANE3:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV3]] + ; GCN-NEXT: [[READANYLANE4:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV4]] + ; GCN-NEXT: [[READANYLANE5:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV5]] + ; GCN-NEXT: [[READANYLANE6:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV6]] + ; GCN-NEXT: [[READANYLANE7:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV7]] + ; GCN-NEXT: [[MV1:%[0-9]+]]:sgpr(s256) = G_MERGE_VALUES [[READANYLANE]](s32), [[READANYLANE1]](s32), [[READANYLANE2]](s32), [[READANYLANE3]](s32), [[READANYLANE4]](s32), [[READANYLANE5]](s32), [[READANYLANE6]](s32), [[READANYLANE7]](s32) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s256) = G_LOAD %0 :: (load (s256) from %ir.constant.not.uniform) ... 
@@ -313,6 +422,7 @@ body: | --- name: load_constant_v16i16_non_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -323,11 +433,21 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY]](p4) :: (load (<8 x s16>) from %ir.constant.not.uniform, align 32, addrspace 4) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY1]](p4) :: (load (<8 x s16>) from %ir.constant.not.uniform, align 32, addrspace 4) ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY1]], [[C]](s64) ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (load (<8 x s16>) from %ir.constant.not.uniform + 16, basealign 32, addrspace 4) ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[LOAD]](<8 x s16>), [[LOAD1]](<8 x s16>) + ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s16>), [[UV1:%[0-9]+]]:vgpr(<2 x s16>), [[UV2:%[0-9]+]]:vgpr(<2 x s16>), [[UV3:%[0-9]+]]:vgpr(<2 x s16>), [[UV4:%[0-9]+]]:vgpr(<2 x s16>), [[UV5:%[0-9]+]]:vgpr(<2 x s16>), [[UV6:%[0-9]+]]:vgpr(<2 x s16>), [[UV7:%[0-9]+]]:vgpr(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s16>) + ; GCN-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(<2 x s16>) = G_READANYLANE [[UV]] + ; GCN-NEXT: [[READANYLANE1:%[0-9]+]]:sgpr(<2 x s16>) = G_READANYLANE [[UV1]] + ; GCN-NEXT: [[READANYLANE2:%[0-9]+]]:sgpr(<2 x s16>) = G_READANYLANE [[UV2]] + ; GCN-NEXT: [[READANYLANE3:%[0-9]+]]:sgpr(<2 x s16>) = G_READANYLANE [[UV3]] + ; GCN-NEXT: [[READANYLANE4:%[0-9]+]]:sgpr(<2 x s16>) = G_READANYLANE [[UV4]] + ; GCN-NEXT: [[READANYLANE5:%[0-9]+]]:sgpr(<2 x s16>) = G_READANYLANE [[UV5]] + ; GCN-NEXT: [[READANYLANE6:%[0-9]+]]:sgpr(<2 x s16>) = G_READANYLANE [[UV6]] + ; GCN-NEXT: [[READANYLANE7:%[0-9]+]]:sgpr(<2 x s16>) = G_READANYLANE [[UV7]] + ; GCN-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:sgpr(<16 x s16>) = G_CONCAT_VECTORS [[READANYLANE]](<2 x s16>), [[READANYLANE1]](<2 x s16>), [[READANYLANE2]](<2 x s16>), [[READANYLANE3]](<2 x s16>), [[READANYLANE4]](<2 x s16>), [[READANYLANE5]](<2 x s16>), [[READANYLANE6]](<2 x s16>), [[READANYLANE7]](<2 x s16>) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<16 x s16>) = G_LOAD %0 :: (load (<16 x s16>) from %ir.constant.not.uniform) ... 
@@ -335,6 +455,7 @@ body: | --- name: load_constant_v4i64_non_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -344,11 +465,29 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v4i64, align 32, addrspace 4) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY1]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v4i64, align 32, addrspace 4) ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY1]], [[C]](s64) ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v4i64 + 16, basealign 32, addrspace 4) ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) + ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s64), [[UV1:%[0-9]+]]:vgpr(s64), [[UV2:%[0-9]+]]:vgpr(s64), [[UV3:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>) + ; GCN-NEXT: [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV]](s64) + ; GCN-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV4]] + ; GCN-NEXT: [[READANYLANE1:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV5]] + ; GCN-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE]](s32), [[READANYLANE1]](s32) + ; GCN-NEXT: [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV1]](s64) + ; GCN-NEXT: [[READANYLANE2:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV6]] + ; GCN-NEXT: [[READANYLANE3:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV7]] + ; GCN-NEXT: [[MV1:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE2]](s32), [[READANYLANE3]](s32) + ; GCN-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV2]](s64) + ; GCN-NEXT: [[READANYLANE4:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV8]] + ; GCN-NEXT: [[READANYLANE5:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV9]] + ; GCN-NEXT: [[MV2:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE4]](s32), [[READANYLANE5]](s32) + ; GCN-NEXT: [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV3]](s64) + ; GCN-NEXT: [[READANYLANE6:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV10]] + ; GCN-NEXT: [[READANYLANE7:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV11]] + ; GCN-NEXT: [[MV3:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE6]](s32), [[READANYLANE7]](s32) + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<4 x s64>) = G_LOAD %0 :: (load (<4 x s64>) from %ir.constant.not.uniform.v4i64) ... 
@@ -356,6 +495,7 @@ body: | --- name: load_constant_v16i32_non_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -365,17 +505,35 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32, align 64, addrspace 4) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY1]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32, align 64, addrspace 4) ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY1]], [[C]](s64) ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 16, basealign 64, addrspace 4) ; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 - ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY1]], [[C1]](s64) ; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 32, align 32, basealign 64, addrspace 4) ; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 - ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY1]], [[C2]](s64) ; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 48, basealign 64, addrspace 4) ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) + ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) + ; GCN-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV]] + ; GCN-NEXT: [[READANYLANE1:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV1]] + ; GCN-NEXT: [[READANYLANE2:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV2]] + ; GCN-NEXT: [[READANYLANE3:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV3]] + ; GCN-NEXT: [[READANYLANE4:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV4]] + ; GCN-NEXT: [[READANYLANE5:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV5]] + ; GCN-NEXT: [[READANYLANE6:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV6]] + ; GCN-NEXT: [[READANYLANE7:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV7]] + ; GCN-NEXT: [[READANYLANE8:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV8]] + ; GCN-NEXT: [[READANYLANE9:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV9]] + ; GCN-NEXT: [[READANYLANE10:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV10]] + ; GCN-NEXT: [[READANYLANE11:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV11]] + ; GCN-NEXT: [[READANYLANE12:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV12]] + ; GCN-NEXT: [[READANYLANE13:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV13]] + ; GCN-NEXT: [[READANYLANE14:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV14]] + ; GCN-NEXT: [[READANYLANE15:%[0-9]+]]:sgpr(s32) = 
G_READANYLANE [[UV15]] + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<16 x s32>) = G_BUILD_VECTOR [[READANYLANE]](s32), [[READANYLANE1]](s32), [[READANYLANE2]](s32), [[READANYLANE3]](s32), [[READANYLANE4]](s32), [[READANYLANE5]](s32), [[READANYLANE6]](s32), [[READANYLANE7]](s32), [[READANYLANE8]](s32), [[READANYLANE9]](s32), [[READANYLANE10]](s32), [[READANYLANE11]](s32), [[READANYLANE12]](s32), [[READANYLANE13]](s32), [[READANYLANE14]](s32), [[READANYLANE15]](s32) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<16 x s32>) = G_LOAD %0 :: (load (<16 x s32>) from %ir.constant.not.uniform.v16i32) ... @@ -383,6 +541,7 @@ body: | --- name: load_constant_v8i64_non_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -392,17 +551,51 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64, align 64, addrspace 4) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY1]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64, align 64, addrspace 4) ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY1]], [[C]](s64) ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 16, basealign 64, addrspace 4) ; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 - ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY1]], [[C1]](s64) ; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD1]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 32, align 32, basealign 64, addrspace 4) ; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 - ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY1]], [[C2]](s64) ; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD2]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 48, basealign 64, addrspace 4) ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>), [[LOAD2]](<2 x s64>), [[LOAD3]](<2 x s64>) + ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s64), [[UV1:%[0-9]+]]:vgpr(s64), [[UV2:%[0-9]+]]:vgpr(s64), [[UV3:%[0-9]+]]:vgpr(s64), [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64), [[UV6:%[0-9]+]]:vgpr(s64), [[UV7:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>) + ; GCN-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV]](s64) + ; GCN-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV8]] + ; GCN-NEXT: [[READANYLANE1:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV9]] + ; GCN-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE]](s32), [[READANYLANE1]](s32) + ; GCN-NEXT: [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV1]](s64) + ; GCN-NEXT: [[READANYLANE2:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV10]] + ; GCN-NEXT: [[READANYLANE3:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV11]] + ; GCN-NEXT: [[MV1:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE2]](s32), [[READANYLANE3]](s32) + ; GCN-NEXT: [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV2]](s64) + ; 
GCN-NEXT: [[READANYLANE4:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV12]] + ; GCN-NEXT: [[READANYLANE5:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV13]] + ; GCN-NEXT: [[MV2:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE4]](s32), [[READANYLANE5]](s32) + ; GCN-NEXT: [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV3]](s64) + ; GCN-NEXT: [[READANYLANE6:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV14]] + ; GCN-NEXT: [[READANYLANE7:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV15]] + ; GCN-NEXT: [[MV3:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE6]](s32), [[READANYLANE7]](s32) + ; GCN-NEXT: [[UV16:%[0-9]+]]:vgpr(s32), [[UV17:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV4]](s64) + ; GCN-NEXT: [[READANYLANE8:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV16]] + ; GCN-NEXT: [[READANYLANE9:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV17]] + ; GCN-NEXT: [[MV4:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE8]](s32), [[READANYLANE9]](s32) + ; GCN-NEXT: [[UV18:%[0-9]+]]:vgpr(s32), [[UV19:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV5]](s64) + ; GCN-NEXT: [[READANYLANE10:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV18]] + ; GCN-NEXT: [[READANYLANE11:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV19]] + ; GCN-NEXT: [[MV5:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE10]](s32), [[READANYLANE11]](s32) + ; GCN-NEXT: [[UV20:%[0-9]+]]:vgpr(s32), [[UV21:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV6]](s64) + ; GCN-NEXT: [[READANYLANE12:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV20]] + ; GCN-NEXT: [[READANYLANE13:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV21]] + ; GCN-NEXT: [[MV6:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE12]](s32), [[READANYLANE13]](s32) + ; GCN-NEXT: [[UV22:%[0-9]+]]:vgpr(s32), [[UV23:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV7]](s64) + ; GCN-NEXT: [[READANYLANE14:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV22]] + ; GCN-NEXT: [[READANYLANE15:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV23]] + ; GCN-NEXT: [[MV7:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[READANYLANE14]](s32), [[READANYLANE15]](s32) + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64), [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<8 x s64>) = G_LOAD %0 :: (load (<8 x s64>) from %ir.constant.not.uniform.v8i64) ... 
@@ -410,6 +603,7 @@ body: | --- name: load_constant_v8i32_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -426,6 +620,7 @@ body: | --- name: load_constant_v16i16_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -442,6 +637,7 @@ body: | --- name: load_constant_v4i64_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -458,6 +654,7 @@ body: | --- name: load_constant_v16i32_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -474,6 +671,7 @@ body: | --- name: load_constant_v8i64_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -490,6 +688,7 @@ body: | --- name: load_local_uniform legalized: true +tracksRegLiveness: true body: | bb.0: liveins: $sgpr0 @@ -500,6 +699,7 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY [[COPY]](p3) ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p3) :: (load (s32), addrspace 3) + ; GCN-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[LOAD]] %0:_(p3) = COPY $sgpr0 %1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 3) @@ -507,6 +707,7 @@ body: | --- name: load_region_uniform legalized: true +tracksRegLiveness: true body: | bb.0: liveins: $sgpr0 @@ -525,6 +726,7 @@ body: | --- name: extload_constant_i8_to_i32_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -535,6 +737,7 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s8), addrspace 4) + ; GFX7-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[LOAD]] ; ; GFX12-LABEL: name: extload_constant_i8_to_i32_uniform ; GFX12: liveins: $sgpr0_sgpr1 @@ -548,6 +751,7 @@ body: | --- name: extload_global_i8_to_i32_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -559,6 +763,7 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s8), addrspace 1) + ; GCN-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[LOAD]] %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load (s8), addrspace 1, align 1) ... @@ -566,6 +771,7 @@ body: | --- name: extload_constant_i16_to_i32_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -577,6 +783,7 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s16), addrspace 4) + ; GFX7-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[LOAD]] ; ; GFX12-LABEL: name: extload_constant_i16_to_i32_uniform ; GFX12: liveins: $sgpr0_sgpr1 @@ -590,6 +797,7 @@ body: | --- name: extload_global_i16_to_i32_uniform legalized: true +tracksRegLiveness: true body: | bb.0: @@ -601,6 +809,7 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s16), addrspace 1) + ; GCN-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[LOAD]] %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load (s16), addrspace 1, align 2) ... 
@@ -608,6 +817,7 @@ body: | --- name: load_constant_i32_uniform_align4 legalized: true +tracksRegLiveness: true body: | bb.0: @@ -624,6 +834,7 @@ body: | --- name: load_constant_i32_uniform_align2 legalized: true +tracksRegLiveness: true body: | bb.0: @@ -635,6 +846,7 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s32), align 2, addrspace 4) + ; GCN-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[LOAD]] %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 4, align 2) ... @@ -642,6 +854,7 @@ body: | --- name: load_constant_i32_uniform_align1 legalized: true +tracksRegLiveness: true body: | bb.0: @@ -653,6 +866,7 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s32), align 1, addrspace 4) + ; GCN-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[LOAD]] %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 4, align 1) ... @@ -660,6 +874,7 @@ body: | --- name: load_private_uniform_sgpr_i32 legalized: true +tracksRegLiveness: true body: | bb.0: @@ -706,10 +921,10 @@ body: | ; GCN-LABEL: name: load_constant_v8i32_vgpr_crash_loop_phi ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x80000000) - ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr(p4) = COPY $sgpr2_sgpr3 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY $vgpr2_vgpr3 ; GCN-NEXT: G_BR %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: @@ -721,14 +936,14 @@ body: | ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[PHI]], [[C]](s64) ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from unknown-address + 16, addrspace 4) ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr(p4) = COPY [[COPY1]](p4) + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr(p4) = COPY [[COPY1]](p4) ; GCN-NEXT: G_BR %bb.1 bb.0: - liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - %0:_(p4) = COPY $sgpr0_sgpr1 - %1:_(p4) = COPY $sgpr2_sgpr3 + %0:_(p4) = COPY $vgpr0_vgpr1 + %1:_(p4) = COPY $vgpr2_vgpr3 G_BR %bb.1 bb.1: @@ -741,6 +956,7 @@ body: | --- name: load_constant_v3i32_align4 legalized: true +tracksRegLiveness: true body: | bb.0: @@ -771,6 +987,7 @@ body: | --- name: load_constant_v3i32_align8 legalized: true +tracksRegLiveness: true body: | bb.0: @@ -801,6 +1018,7 @@ body: | --- name: load_constant_v3i32_align16 legalized: true +tracksRegLiveness: true body: | bb.0: @@ -828,6 +1046,7 @@ body: | --- name: load_constant_v6i16_align4 legalized: true +tracksRegLiveness: true body: | bb.0: @@ -840,10 +1059,9 @@ body: | ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8 ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(<2 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (<2 x s16>) from unknown-address + 8, addrspace 4) - ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16), [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>) 
- ; GFX7-NEXT: [[UV4:%[0-9]+]]:sgpr(s16), [[UV5:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD1]](<2 x s16>) - ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16) - ; GFX7-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<6 x s16>) + ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(<2 x s16>), [[UV1:%[0-9]+]]:sgpr(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>) + ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:sgpr(<6 x s16>) = G_CONCAT_VECTORS [[UV]](<2 x s16>), [[UV1]](<2 x s16>), [[LOAD1]](<2 x s16>) + ; GFX7-NEXT: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<6 x s16>) ; ; GFX12-LABEL: name: load_constant_v6i16_align4 ; GFX12: liveins: $sgpr0_sgpr1 @@ -859,6 +1077,7 @@ body: | --- name: load_constant_v6i16_align8 legalized: true +tracksRegLiveness: true body: | bb.0: @@ -871,10 +1090,9 @@ body: | ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8 ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(<2 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (<2 x s16>) from unknown-address + 8, align 8, addrspace 4) - ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16), [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>) - ; GFX7-NEXT: [[UV4:%[0-9]+]]:sgpr(s16), [[UV5:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD1]](<2 x s16>) - ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16) - ; GFX7-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<6 x s16>) + ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(<2 x s16>), [[UV1:%[0-9]+]]:sgpr(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>) + ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:sgpr(<6 x s16>) = G_CONCAT_VECTORS [[UV]](<2 x s16>), [[UV1]](<2 x s16>), [[LOAD1]](<2 x s16>) + ; GFX7-NEXT: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<6 x s16>) ; ; GFX12-LABEL: name: load_constant_v6i16_align8 ; GFX12: liveins: $sgpr0_sgpr1 @@ -890,6 +1108,7 @@ body: | --- name: load_constant_v6i16_align16 legalized: true +tracksRegLiveness: true body: | bb.0: @@ -899,9 +1118,9 @@ body: | ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<8 x s16>), addrspace 4) - ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16), [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16), [[UV4:%[0-9]+]]:sgpr(s16), [[UV5:%[0-9]+]]:sgpr(s16), [[UV6:%[0-9]+]]:sgpr(s16), [[UV7:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD]](<8 x s16>) - ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16) - ; GFX7-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<6 x s16>) + ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(<2 x s16>), [[UV1:%[0-9]+]]:sgpr(<2 x s16>), [[UV2:%[0-9]+]]:sgpr(<2 x s16>), [[UV3:%[0-9]+]]:sgpr(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<8 x s16>) + ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:sgpr(<6 x s16>) = G_CONCAT_VECTORS [[UV]](<2 x s16>), [[UV1]](<2 x s16>), [[UV2]](<2 x s16>) + ; GFX7-NEXT: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<6 x s16>) ; ; GFX12-LABEL: name: load_constant_v6i16_align16 ; GFX12: liveins: $sgpr0_sgpr1 @@ -917,6 +1136,7 @@ body: | --- name: load_constant_i96_align4 legalized: true +tracksRegLiveness: true body: | bb.0: @@ -947,6 +1167,7 @@ body: | --- name: load_constant_i96_align8 legalized: true 
+tracksRegLiveness: true body: | bb.0: @@ -977,6 +1198,7 @@ body: | --- name: load_constant_i96_align16 legalized: true +tracksRegLiveness: true body: | bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-rb-legalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-rb-legalize.mir index 208bf686c98ba..06d7a4d6121a6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-rb-legalize.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-rb-legalize.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=none %s -verify-machineinstrs -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=rb-legalize %s -verify-machineinstrs -o - | FileCheck %s --- name: uniform_in_vgpr @@ -16,9 +16,12 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; CHECK-NEXT: [[FPTOUI:%[0-9]+]]:sgpr(s32) = G_FPTOUI [[COPY]](s32) - ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[FPTOUI]], [[COPY1]] - ; CHECK-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[FPTOUI:%[0-9]+]]:vgpr(s32) = G_FPTOUI [[COPY4]](s32) + ; CHECK-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[FPTOUI]] + ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[READANYLANE]], [[COPY1]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; CHECK-NEXT: G_STORE [[COPY5]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 @@ -47,10 +50,14 @@ body: | ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) - ; CHECK-NEXT: [[FADD:%[0-9]+]]:sgpr(s32) = G_FADD [[COPY]], [[COPY1]] - ; CHECK-NEXT: [[FPTOUI:%[0-9]+]]:sgpr(s32) = G_FPTOUI [[FADD]](s32) - ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[FPTOUI]], [[COPY2]] - ; CHECK-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[FADD:%[0-9]+]]:vgpr(s32) = G_FADD [[COPY5]], [[COPY6]] + ; CHECK-NEXT: [[FPTOUI:%[0-9]+]]:vgpr(s32) = G_FPTOUI [[FADD]](s32) + ; CHECK-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[FPTOUI]] + ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[READANYLANE]], [[COPY2]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; CHECK-NEXT: G_STORE [[COPY7]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 @@ -85,11 +92,20 @@ body: | ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY5]](s32), [[COPY6]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY 
[[COPY4]](s32) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY8]], [[C]], 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) + ; CHECK-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV]] + ; CHECK-NEXT: [[READANYLANE1:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV1]] + ; CHECK-NEXT: [[READANYLANE2:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV2]] + ; CHECK-NEXT: [[READANYLANE3:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV3]] + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[READANYLANE]](s32), [[READANYLANE1]](s32), [[READANYLANE2]](s32), [[READANYLANE3]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) - ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[UV1]], [[C1]] - ; CHECK-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[UV5]], [[C1]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; CHECK-NEXT: G_STORE [[COPY9]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %3:sgpr(s32) = COPY $sgpr0 %4:sgpr(s32) = COPY $sgpr1 @@ -129,10 +145,12 @@ body: | ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY5]](s32), [[COPY6]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) - ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[UV1]], [[C1]] + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[UV1]], [[COPY8]] ; CHECK-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %3:sgpr(s32) = COPY $sgpr0 @@ -172,8 +190,12 @@ body: | ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 ; CHECK-NEXT: [[MV2:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s64) = G_AND [[MV]], [[MV1]] - ; CHECK-NEXT: G_STORE [[AND]](s64), [[MV2]](p1) :: (store (s64), addrspace 1) + ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES 
[[MV1]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[UV2]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]] + ; CHECK-NEXT: [[MV3:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32) + ; CHECK-NEXT: G_STORE [[MV3]](s64), [[MV2]](p1) :: (store (s64), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %3:vgpr(s32) = COPY $vgpr0 %4:vgpr(s32) = COPY $vgpr1 @@ -204,9 +226,12 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) - ; CHECK-NEXT: [[ABS:%[0-9]+]]:sgpr(s16) = G_ABS [[TRUNC]] - ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[ABS]](s16) - ; CHECK-NEXT: G_STORE [[ANYEXT]](s32), [[MV]](p1) :: (store (s16), addrspace 1) + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC]](s16) + ; CHECK-NEXT: [[ABS:%[0-9]+]]:sgpr(s32) = G_ABS [[SEXT]] + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[ABS]](s32) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC1]](s16) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: G_STORE [[COPY3]](s32), [[MV]](p1) :: (store (s16), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %2:sgpr(s32) = COPY $sgpr0 %0:sgpr(s16) = G_TRUNC %2(s32) @@ -235,24 +260,31 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]] ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]] - ; CHECK-NEXT: G_BRCOND [[ICMP1]](s1), %bb.2 + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ICMP1]], [[C2]] + ; CHECK-NEXT: G_BRCOND [[AND]](s32), %bb.2 ; CHECK-NEXT: G_BR %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:sgpr(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]] + ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ult), [[COPY2]](s32), [[C3]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sgpr(s1) = G_PHI [[ICMP]](s1), %bb.0, [[ICMP2]](s1), %bb.1 - ; CHECK-NEXT: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[PHI]](s1) - ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 - ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SEXT]], [[C3]] - ; CHECK-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[ICMP]](s32), %bb.0, [[ICMP2]](s32), %bb.1 + ; CHECK-NEXT: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[PHI]], [[C4]] + ; CHECK-NEXT: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND1]](s32), [[C5]], [[C6]] + ; CHECK-NEXT: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SELECT]], [[C7]] + ; 
CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; CHECK-NEXT: G_STORE [[COPY4]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 bb.1: successors: %bb.2(0x30000000), %bb.3(0x50000000) @@ -302,9 +334,15 @@ body: | ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00 - ; CHECK-NEXT: [[FCMP:%[0-9]+]]:sgpr(s1) = G_FCMP floatpred(oeq), [[COPY]](s32), [[C]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY1]], [[COPY2]] - ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY5]](s32), [[COPY6]] + ; CHECK-NEXT: [[COPY_SCC_VCC:%[0-9]+]]:sgpr(s32) = G_COPY_SCC_VCC [[FCMP]](s1) + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[COPY_SCC_VCC]], [[C1]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[COPY1]], [[COPY2]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32) + ; CHECK-NEXT: G_STORE [[COPY7]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 @@ -336,8 +374,9 @@ body: | ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[COPY1]], [[COPY2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[COPY_VCC_SCC:%[0-9]+]]:vcc(s1) = G_COPY_VCC_SCC [[ICMP]](s32) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY_VCC_SCC]](s1), [[COPY1]], [[COPY2]] ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 @@ -369,8 +408,11 @@ body: | ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vcc(s1) = G_TRUNC [[COPY]](s32) - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[TRUNC]](s1), [[COPY1]], [[COPY2]] + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[AND]](s32), [[C1]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[COPY1]], [[COPY2]] ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:vgpr(s32) = COPY $vgpr0 @@ -400,9 +442,14 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] - ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[ICMP]](s1) - ; CHECK-NEXT: 
G_STORE [[ZEXT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ICMP]], [[C1]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[COPY3]], [[C2]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32) + ; CHECK-NEXT: G_STORE [[COPY4]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 %2:vgpr(s32) = COPY $vgpr0 @@ -430,9 +477,14 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] - ; CHECK-NEXT: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[ICMP]](s1) - ; CHECK-NEXT: G_STORE [[SEXT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ICMP]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[C2]], [[C3]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32) + ; CHECK-NEXT: G_STORE [[COPY3]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 %2:vgpr(s32) = COPY $vgpr0 @@ -461,9 +513,11 @@ body: | ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(uge), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(uge), [[COPY]](s32), [[COPY4]] ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(uge), [[COPY1]](s32), [[C1]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(uge), [[COPY1]](s32), [[COPY5]] ; CHECK-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[AND]](s1), [[COPY]], [[COPY1]] ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) @@ -499,12 +553,15 @@ body: | ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s1) = G_ICMP intpred(uge), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(uge), [[COPY]](s32), [[C]] ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s1) = G_ICMP intpred(uge), [[COPY1]](s32), [[C1]] - ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s1) = G_AND [[ICMP]], [[ICMP1]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s1), [[COPY]], [[COPY1]] - ; CHECK-NEXT: G_STORE 
[[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(uge), [[COPY1]](s32), [[C1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ICMP]], [[ICMP1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[AND]], [[C2]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND1]](s32), [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32) + ; CHECK-NEXT: G_STORE [[COPY4]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 @@ -536,9 +593,10 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP]](s1) - ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY3]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP]](s1) + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY4]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: G_BR %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: @@ -548,8 +606,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[C]](s32), %bb.0, [[C1]](s32), %bb.1 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY [[SI_IF]](s32) - ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[COPY4]](s32) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY [[SI_IF]](s32) + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[COPY5]](s32) ; CHECK-NEXT: G_STORE [[PHI]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 bb.1: @@ -603,20 +661,22 @@ body: | ; CHECK-NEXT: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.1 ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[PHI1]], [[C2]] - ; CHECK-NEXT: [[UITOFP:%[0-9]+]]:sgpr(s32) = G_UITOFP [[ADD]](s32) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; CHECK-NEXT: [[UITOFP:%[0-9]+]]:vgpr(s32) = G_UITOFP [[COPY3]](s32) ; CHECK-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]] ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec(s32) = COPY [[INT]](s32) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY [[COPY3]](s32) - ; CHECK-NEXT: SI_LOOP [[COPY3]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec(s32) = COPY [[INT]](s32) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY [[COPY4]](s32) + ; CHECK-NEXT: SI_LOOP [[COPY4]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: G_BR %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: [[PHI2:%[0-9]+]]:vgpr(s32) = G_PHI [[ADD]](s32), %bb.1 - ; CHECK-NEXT: [[PHI3:%[0-9]+]]:sgpr(s32) = G_PHI [[COPY4]](s32), %bb.1 + ; 
CHECK-NEXT: [[PHI3:%[0-9]+]]:sgpr(s32) = G_PHI [[COPY5]](s32), %bb.1 ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[MUL:%[0-9]+]]:vgpr(s32) = G_MUL [[PHI2]], [[C3]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK-NEXT: [[MUL:%[0-9]+]]:vgpr(s32) = G_MUL [[PHI2]], [[COPY6]] ; CHECK-NEXT: G_STORE [[MUL]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 bb.1: @@ -685,37 +745,47 @@ body: | ; CHECK-NEXT: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI %15(s32), %bb.3, [[C]](s32), %bb.0 ; CHECK-NEXT: [[PHI2:%[0-9]+]]:vgpr(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) - ; CHECK-NEXT: [[SEXT:%[0-9]+]]:vgpr(s64) = G_SEXT [[PHI2]](s32) - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:vgpr(s64) = G_SHL [[SEXT]], [[C1]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[ASHR:%[0-9]+]]:vgpr(s32) = G_ASHR [[PHI2]], [[C1]](s32) + ; CHECK-NEXT: [[MV3:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[PHI2]](s32), [[ASHR]](s32) + ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY7]](s32) ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) - ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C2]] - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP]](s1) - ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 true - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1) + ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[COPY8]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP]](s1) + ; CHECK-NEXT: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sgpr(s32) = COPY [[C4]](s32) + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[C4]], [[COPY10]] + ; CHECK-NEXT: [[COPY_VCC_SCC:%[0-9]+]]:sreg_32(s1) = G_COPY_VCC_SCC [[AND]](s32) ; CHECK-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc - ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY_VCC_SCC]](s1), implicit-def $scc ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) - ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY9]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: G_BR %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[C4:%[0-9]+]]:sgpr(s32) = 
G_CONSTANT i32 2 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:vgpr(s64) = G_SHL [[SEXT]], [[C4]](s32) + ; CHECK-NEXT: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[C5]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY12]](s32) ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[MV2]], [[SHL1]](s64) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1) - ; CHECK-NEXT: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[C5]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP1]](s1) - ; CHECK-NEXT: [[C6:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 true - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C6]](s1) - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[COPY11]](s1) - ; CHECK-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY10]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[C6]](s32) + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[COPY13]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP1]](s1) + ; CHECK-NEXT: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr(s32) = COPY [[C7]](s32) + ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[C7]], [[COPY15]] + ; CHECK-NEXT: [[COPY_VCC_SCC1:%[0-9]+]]:sreg_32(s1) = G_COPY_VCC_SCC [[AND1]](s32) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[COPY_VCC_SCC1]](s1) + ; CHECK-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY14]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: G_BR %bb.4 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: @@ -723,50 +793,54 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %45(s1), %bb.5 ; CHECK-NEXT: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI %46(s32), %bb.5, [[DEF]](s32), %bb.1 - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr(s32) = COPY [[SI_IF]](s32) - ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[COPY14]](s32) - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vcc(s1) = COPY [[COPY13]](s1) - ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI1]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sreg_32_xm0_xexec(s32) = COPY [[INT]](s32) - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr(s32) = COPY [[COPY16]](s32) - ; CHECK-NEXT: SI_LOOP [[COPY16]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr(s32) = COPY [[SI_IF]](s32) + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[COPY18]](s32) + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:vcc(s1) = COPY [[COPY17]](s1) + ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY19]](s1), [[PHI1]](s32) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:sreg_32_xm0_xexec(s32) = COPY [[INT]](s32) + ; CHECK-NEXT: [[COPY21:%[0-9]+]]:sgpr(s32) = COPY [[COPY20]](s32) + ; CHECK-NEXT: SI_LOOP [[COPY20]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: G_BR %bb.6 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: 
{{ $}} - ; CHECK-NEXT: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 - ; CHECK-NEXT: [[SHL2:%[0-9]+]]:vgpr(s64) = G_SHL [[SEXT]], [[C7]](s32) + ; CHECK-NEXT: [[C8:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[COPY22:%[0-9]+]]:vgpr(s32) = COPY [[C8]](s32) + ; CHECK-NEXT: [[SHL2:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY22]](s32) ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[MV]], [[SHL2]](s64) ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:vgpr(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s32), addrspace 1) - ; CHECK-NEXT: [[C8:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[LOAD2]], [[C8]] + ; CHECK-NEXT: [[C9:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[COPY23:%[0-9]+]]:vgpr(s32) = COPY [[C9]](s32) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[LOAD2]], [[COPY23]] ; CHECK-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD2]](p1) :: (store (s32), addrspace 1) - ; CHECK-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[PHI2]], [[C8]] - ; CHECK-NEXT: [[C9:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 100 - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C9]] - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) - ; CHECK-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc - ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY18]](s1), implicit-def $scc + ; CHECK-NEXT: [[COPY24:%[0-9]+]]:vgpr(s32) = COPY [[C9]](s32) + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[PHI2]], [[COPY24]] + ; CHECK-NEXT: [[C10:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 100 + ; CHECK-NEXT: [[COPY25:%[0-9]+]]:vgpr(s32) = COPY [[C10]](s32) + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[COPY25]] + ; CHECK-NEXT: [[COPY26:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) + ; CHECK-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY16]](s1), $exec_lo, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY26]](s1), implicit-def $scc ; CHECK-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[COPY11]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4 + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[COPY_VCC_SCC1]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4 ; CHECK-NEXT: [[PHI6:%[0-9]+]]:vgpr(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2 - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[COPY19]](s1) - ; CHECK-NEXT: [[COPY21:%[0-9]+]]:sgpr(s32) = COPY [[SI_IF1]](s32) - ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[COPY21]](s32) - ; CHECK-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc - ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY20]](s1), implicit-def $scc + ; CHECK-NEXT: [[COPY27:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) + ; CHECK-NEXT: [[COPY28:%[0-9]+]]:sreg_32(s1) = COPY [[COPY27]](s1) + ; CHECK-NEXT: [[COPY29:%[0-9]+]]:sgpr(s32) = COPY [[SI_IF1]](s32) + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[COPY29]](s32) + ; CHECK-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc + ; CHECK-NEXT: 
[[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY28]](s1), implicit-def $scc ; CHECK-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc ; CHECK-NEXT: G_BR %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.6: - ; CHECK-NEXT: [[PHI7:%[0-9]+]]:sgpr(s32) = G_PHI [[COPY17]](s32), %bb.3 + ; CHECK-NEXT: [[PHI7:%[0-9]+]]:sgpr(s32) = G_PHI [[COPY21]](s32), %bb.3 ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32) ; CHECK-NEXT: S_ENDPGM 0 bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.ll index d75b1ec0f4274..451b98eb3dbc0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=OLD_RBS_GFX10 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=OLD_RBS_GFX12 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=NEW_RBS_GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=NEW_RBS_GFX12 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=NEW_RBS_GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -new-reg-bank-select < %s | FileCheck -check-prefix=NEW_RBS_GFX12 %s define amdgpu_ps void @salu_float(float inreg %a, float inreg %b, i32 inreg %c, ptr addrspace(1) %ptr) { ; OLD_RBS_GFX10-LABEL: salu_float: @@ -30,7 +30,9 @@ define amdgpu_ps void @salu_float(float inreg %a, float inreg %b, i32 inreg %c, ; NEW_RBS_GFX10: ; %bb.0: ; NEW_RBS_GFX10-NEXT: v_add_f32_e64 v2, s0, s1 ; NEW_RBS_GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 -; NEW_RBS_GFX10-NEXT: v_add_nc_u32_e32 v2, s2, v2 +; NEW_RBS_GFX10-NEXT: v_readfirstlane_b32 s0, v2 +; NEW_RBS_GFX10-NEXT: s_add_i32 s0, s0, s2 +; NEW_RBS_GFX10-NEXT: v_mov_b32_e32 v2, s0 ; NEW_RBS_GFX10-NEXT: global_store_dword v[0:1], v2, off ; NEW_RBS_GFX10-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.mir index 7835db6272ef0..24eb1639f412f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.mir @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=OLD_RBS_GFX10 # RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=OLD_RBS_GFX12 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=NEW_RBS_GFX10 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=NEW_RBS_GFX12 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass="rb-select,rb-legalize" %s -verify-machineinstrs -o - | FileCheck %s 
-check-prefixes=NEW_RBS_GFX10 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass="rb-select,rb-legalize" %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=NEW_RBS_GFX12 --- name: salu_float @@ -58,9 +58,10 @@ body: | ; NEW_RBS_GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) ; NEW_RBS_GFX10-NEXT: [[FADD:%[0-9]+]]:vgpr(s32) = G_FADD [[COPY5]], [[COPY6]] ; NEW_RBS_GFX10-NEXT: [[FPTOUI:%[0-9]+]]:vgpr(s32) = G_FPTOUI [[FADD]](s32) - ; NEW_RBS_GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) - ; NEW_RBS_GFX10-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[FPTOUI]], [[COPY7]] - ; NEW_RBS_GFX10-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS_GFX10-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[FPTOUI]] + ; NEW_RBS_GFX10-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[READANYLANE]], [[COPY2]] + ; NEW_RBS_GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; NEW_RBS_GFX10-NEXT: G_STORE [[COPY7]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; NEW_RBS_GFX10-NEXT: S_ENDPGM 0 ; ; NEW_RBS_GFX12-LABEL: name: salu_float diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll index 287a8ab0e52f5..63dbf3a8d3164 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=OLD_RBS %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=NEW_RBS %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=NEW_RBS %s ; if instruction is uniform and there is available instruction, select SALU instruction define amdgpu_ps void @uniform_in_vgpr(float inreg %a, i32 inreg %b, ptr addrspace(1) %ptr) { @@ -14,7 +14,9 @@ define amdgpu_ps void @uniform_in_vgpr(float inreg %a, i32 inreg %b, ptr addrspa ; NEW_RBS-LABEL: uniform_in_vgpr: ; NEW_RBS: ; %bb.0: ; NEW_RBS-NEXT: v_cvt_u32_f32_e32 v2, s0 -; NEW_RBS-NEXT: v_add_nc_u32_e32 v2, s1, v2 +; NEW_RBS-NEXT: v_readfirstlane_b32 s0, v2 +; NEW_RBS-NEXT: s_add_i32 s0, s0, s1 +; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0 ; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off ; NEW_RBS-NEXT: s_endpgm %a.i32 = fptoui float %a to i32 @@ -37,7 +39,9 @@ define amdgpu_ps void @back_to_back_uniform_in_vgpr(float inreg %a, float inreg ; NEW_RBS: ; %bb.0: ; NEW_RBS-NEXT: v_add_f32_e64 v2, s0, s1 ; NEW_RBS-NEXT: v_cvt_u32_f32_e32 v2, v2 -; NEW_RBS-NEXT: v_add_nc_u32_e32 v2, s2, v2 +; NEW_RBS-NEXT: v_readfirstlane_b32 s0, v2 +; NEW_RBS-NEXT: s_add_i32 s0, s0, s2 +; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0 ; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off ; NEW_RBS-NEXT: s_endpgm %add = fadd float %a, %b @@ -63,7 +67,9 @@ define amdgpu_cs void @buffer_load_uniform(<4 x i32> inreg %rsrc, i32 inreg %vof ; NEW_RBS-NEXT: v_mov_b32_e32 v2, s4 ; NEW_RBS-NEXT: buffer_load_dwordx4 v[2:5], v2, s[0:3], 0 offen ; NEW_RBS-NEXT: s_waitcnt vmcnt(0) -; NEW_RBS-NEXT: v_add_nc_u32_e32 v2, 1, v3 +; NEW_RBS-NEXT: v_readfirstlane_b32 s0, v3 +; NEW_RBS-NEXT: s_add_i32 s0, s0, 1 +; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0 ; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off ; NEW_RBS-NEXT: s_endpgm .entry: @@ -168,7 +174,8 @@ define amdgpu_ps void @uniform_i1_phi(ptr addrspace(1) %out, i32 inreg %tid, 
i32 ; NEW_RBS-NEXT: s_cmp_lt_u32 s0, 1 ; NEW_RBS-NEXT: s_cselect_b32 s2, 1, 0 ; NEW_RBS-NEXT: .LBB6_2: ; %exit -; NEW_RBS-NEXT: s_bfe_i32 s0, s2, 0x10000 +; NEW_RBS-NEXT: s_cmp_lg_u32 s2, 0 +; NEW_RBS-NEXT: s_cselect_b32 s0, -1, 0 ; NEW_RBS-NEXT: s_add_i32 s0, s0, 2 ; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0 ; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off @@ -202,9 +209,13 @@ define amdgpu_ps void @vcc_to_scc(float inreg %a, i32 inreg %b, i32 inreg %c, pt ; ; NEW_RBS-LABEL: vcc_to_scc: ; NEW_RBS: ; %bb.0: -; NEW_RBS-NEXT: v_mov_b32_e32 v2, s2 ; NEW_RBS-NEXT: v_cmp_eq_f32_e64 s0, s0, 0 -; NEW_RBS-NEXT: v_cndmask_b32_e64 v2, v2, s1, s0 +; NEW_RBS-NEXT: s_cmp_lg_u32 s0, 0 +; NEW_RBS-NEXT: s_cselect_b32 s0, 1, 0 +; NEW_RBS-NEXT: s_and_b32 s0, s0, 1 +; NEW_RBS-NEXT: s_cmp_lg_u32 s0, 0 +; NEW_RBS-NEXT: s_cselect_b32 s0, s1, s2 +; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0 ; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off ; NEW_RBS-NEXT: s_endpgm %vcc_to_scc = fcmp oeq float %a, 0.0 @@ -228,9 +239,7 @@ define amdgpu_ps void @scc_to_vcc(i32 inreg %a, i32 %b, i32 %c, ptr addrspace(1) ; NEW_RBS-LABEL: scc_to_vcc: ; NEW_RBS: ; %bb.0: ; NEW_RBS-NEXT: s_cmp_eq_u32 s0, 0 -; NEW_RBS-NEXT: s_cselect_b32 s0, 1, 0 -; NEW_RBS-NEXT: s_and_b32 s0, 1, s0 -; NEW_RBS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; NEW_RBS-NEXT: s_cselect_b32 vcc_lo, exec_lo, 0 ; NEW_RBS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; NEW_RBS-NEXT: global_store_dword v[2:3], v0, off ; NEW_RBS-NEXT: s_endpgm @@ -300,8 +309,7 @@ define amdgpu_ps void @sext(i32 inreg %a, ptr addrspace(1) %ptr) { ; NEW_RBS-LABEL: sext: ; NEW_RBS: ; %bb.0: ; NEW_RBS-NEXT: s_cmp_eq_u32 s0, 10 -; NEW_RBS-NEXT: s_cselect_b32 s0, 1, 0 -; NEW_RBS-NEXT: s_bfe_i32 s0, s0, 0x10000 +; NEW_RBS-NEXT: s_cselect_b32 s0, -1, 0 ; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0 ; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off ; NEW_RBS-NEXT: s_endpgm @@ -362,7 +370,6 @@ define amdgpu_ps void @and_i1_scc(i32 inreg %a, i32 inreg %b, ptr addrspace(1) % ; NEW_RBS-NEXT: s_cmp_ge_u32 s1, 20 ; NEW_RBS-NEXT: s_cselect_b32 s3, 1, 0 ; NEW_RBS-NEXT: s_and_b32 s2, s2, s3 -; NEW_RBS-NEXT: s_and_b32 s2, s2, 1 ; NEW_RBS-NEXT: s_cmp_lg_u32 s2, 0 ; NEW_RBS-NEXT: s_cselect_b32 s0, s0, s1 ; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0 @@ -395,12 +402,13 @@ define amdgpu_ps void @divergent_phi_with_uniform_inputs(i32 %a, ptr addrspace(1 ; NEW_RBS: ; %bb.0: ; %A ; NEW_RBS-NEXT: s_mov_b32 s0, 0 ; NEW_RBS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; NEW_RBS-NEXT: s_and_saveexec_b32 s1, vcc_lo +; NEW_RBS-NEXT: v_mov_b32_e32 v0, s0 +; NEW_RBS-NEXT: s_and_saveexec_b32 s0, vcc_lo ; NEW_RBS-NEXT: ; %bb.1: ; %B -; NEW_RBS-NEXT: s_mov_b32 s0, 1 +; NEW_RBS-NEXT: s_mov_b32 s1, 1 +; NEW_RBS-NEXT: v_mov_b32_e32 v0, s1 ; NEW_RBS-NEXT: ; %bb.2: ; %exit -; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; NEW_RBS-NEXT: v_mov_b32_e32 v0, s0 +; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; NEW_RBS-NEXT: global_store_dword v[1:2], v0, off ; NEW_RBS-NEXT: s_endpgm A: @@ -443,19 +451,19 @@ define amdgpu_ps void @divergent_because_of_temporal_divergent_use(float %val, p ; NEW_RBS-LABEL: divergent_because_of_temporal_divergent_use: ; NEW_RBS: ; %bb.0: ; %entry ; NEW_RBS-NEXT: s_mov_b32 s0, -1 -; NEW_RBS-NEXT: v_mov_b32_e32 v3, s0 -; NEW_RBS-NEXT: s_mov_b32 s0, 0 +; NEW_RBS-NEXT: s_mov_b32 s1, 0 ; NEW_RBS-NEXT: .LBB15_1: ; %loop ; NEW_RBS-NEXT: ; =>This Inner Loop Header: Depth=1 -; NEW_RBS-NEXT: v_add_nc_u32_e32 v3, 1, v3 -; NEW_RBS-NEXT: v_cvt_f32_u32_e32 v4, v3 -; NEW_RBS-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0 -; NEW_RBS-NEXT: s_or_b32 s0, vcc_lo, s0 -; 
NEW_RBS-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; NEW_RBS-NEXT: s_add_i32 s0, s0, 1 +; NEW_RBS-NEXT: v_cvt_f32_u32_e32 v3, s0 +; NEW_RBS-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v0 +; NEW_RBS-NEXT: s_or_b32 s1, vcc_lo, s1 +; NEW_RBS-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 ; NEW_RBS-NEXT: s_cbranch_execnz .LBB15_1 ; NEW_RBS-NEXT: ; %bb.2: ; %exit -; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; NEW_RBS-NEXT: v_mul_lo_u32 v0, v3, 10 +; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; NEW_RBS-NEXT: v_mov_b32_e32 v0, s0 +; NEW_RBS-NEXT: v_mul_lo_u32 v0, v0, 10 ; NEW_RBS-NEXT: global_store_dword v[1:2], v0, off ; NEW_RBS-NEXT: s_endpgm entry: @@ -550,9 +558,9 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; NEW_RBS-NEXT: .LBB16_1: ; %Flow3 ; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1 ; NEW_RBS-NEXT: s_waitcnt_depctr 0xffe3 -; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; NEW_RBS-NEXT: s_andn2_b32 s1, s1, exec_lo -; NEW_RBS-NEXT: s_and_b32 s3, exec_lo, s4 +; NEW_RBS-NEXT: s_and_b32 s3, exec_lo, s3 ; NEW_RBS-NEXT: s_or_b32 s1, s1, s3 ; NEW_RBS-NEXT: .LBB16_2: ; %Flow ; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1 @@ -565,7 +573,7 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; NEW_RBS-NEXT: ; =>This Inner Loop Header: Depth=1 ; NEW_RBS-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; NEW_RBS-NEXT: s_andn2_b32 s1, s1, exec_lo -; NEW_RBS-NEXT: s_and_b32 s2, exec_lo, -1 +; NEW_RBS-NEXT: s_and_b32 s2, exec_lo, exec_lo ; NEW_RBS-NEXT: s_or_b32 s1, s1, s2 ; NEW_RBS-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7] ; NEW_RBS-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7 @@ -579,11 +587,11 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1 ; NEW_RBS-NEXT: v_add_co_u32 v9, vcc_lo, v4, v7 ; NEW_RBS-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo -; NEW_RBS-NEXT: s_mov_b32 s4, -1 +; NEW_RBS-NEXT: s_mov_b32 s3, exec_lo ; NEW_RBS-NEXT: global_load_dword v9, v[9:10], off ; NEW_RBS-NEXT: s_waitcnt vmcnt(0) ; NEW_RBS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; NEW_RBS-NEXT: s_and_saveexec_b32 s3, vcc_lo +; NEW_RBS-NEXT: s_and_saveexec_b32 s4, vcc_lo ; NEW_RBS-NEXT: s_cbranch_execz .LBB16_1 ; NEW_RBS-NEXT: ; %bb.5: ; %loop.body ; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1 @@ -591,11 +599,11 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; NEW_RBS-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo ; NEW_RBS-NEXT: v_add_nc_u32_e32 v10, 1, v6 ; NEW_RBS-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v6 -; NEW_RBS-NEXT: s_andn2_b32 s4, -1, exec_lo +; NEW_RBS-NEXT: s_andn2_b32 s3, s3, exec_lo ; NEW_RBS-NEXT: global_load_dword v9, v[7:8], off ; NEW_RBS-NEXT: v_mov_b32_e32 v6, v10 ; NEW_RBS-NEXT: s_and_b32 s5, exec_lo, vcc_lo -; NEW_RBS-NEXT: s_or_b32 s4, s4, s5 +; NEW_RBS-NEXT: s_or_b32 s3, s3, s5 ; NEW_RBS-NEXT: s_waitcnt vmcnt(0) ; NEW_RBS-NEXT: v_add_nc_u32_e32 v9, 1, v9 ; NEW_RBS-NEXT: global_store_dword v[7:8], v9, off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.mir index 2d41527a9a480..868928d60dbcc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-regbankselect %s 
-verify-machineinstrs -o - | FileCheck %s -check-prefixes=OLD_RBS
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=NEW_RBS
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass="rb-select,rb-legalize" %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=NEW_RBS
 ---
 name: uniform_in_vgpr
@@ -34,9 +34,10 @@ body: |
     ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
     ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
     ; NEW_RBS-NEXT: [[FPTOUI:%[0-9]+]]:vgpr(s32) = G_FPTOUI [[COPY4]](s32)
-    ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
-    ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[FPTOUI]], [[COPY5]]
-    ; NEW_RBS-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
+    ; NEW_RBS-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[FPTOUI]]
+    ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[READANYLANE]], [[COPY1]]
+    ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
+    ; NEW_RBS-NEXT: G_STORE [[COPY5]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $sgpr1
@@ -87,9 +88,10 @@ body: |
     ; NEW_RBS-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
     ; NEW_RBS-NEXT: [[FADD:%[0-9]+]]:vgpr(s32) = G_FADD [[COPY5]], [[COPY6]]
     ; NEW_RBS-NEXT: [[FPTOUI:%[0-9]+]]:vgpr(s32) = G_FPTOUI [[FADD]](s32)
-    ; NEW_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32)
-    ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[FPTOUI]], [[COPY7]]
-    ; NEW_RBS-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
+    ; NEW_RBS-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[FPTOUI]]
+    ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[READANYLANE]], [[COPY2]]
+    ; NEW_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
+    ; NEW_RBS-NEXT: G_STORE [[COPY7]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $sgpr1
@@ -150,11 +152,17 @@ body: |
     ; NEW_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; NEW_RBS-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32)
     ; NEW_RBS-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY8]], [[C]], 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
-    ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
     ; NEW_RBS-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>)
-    ; NEW_RBS-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
-    ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[UV1]], [[COPY9]]
-    ; NEW_RBS-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
+    ; NEW_RBS-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV]]
+    ; NEW_RBS-NEXT: [[READANYLANE1:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV1]]
+    ; NEW_RBS-NEXT: [[READANYLANE2:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV2]]
+    ; NEW_RBS-NEXT: [[READANYLANE3:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[UV3]]
+    ; NEW_RBS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[READANYLANE]](s32), [[READANYLANE1]](s32), [[READANYLANE2]](s32), [[READANYLANE3]](s32)
+    ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; NEW_RBS-NEXT: [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
+    ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[UV5]], [[C1]]
+    ; NEW_RBS-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
+    ; NEW_RBS-NEXT: G_STORE [[COPY9]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
     %3:_(s32) = COPY $sgpr0
     %4:_(s32) = COPY $sgpr1
@@ -407,29 +415,28 @@ body: |
     ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
     ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
     ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
-    ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
     ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
     ; NEW_RBS-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]]
-    ; NEW_RBS-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32)
-    ; NEW_RBS-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC1]](s1)
-    ; NEW_RBS-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
-    ; NEW_RBS-NEXT: G_BRCOND [[ZEXT]](s32), %bb.2
+    ; NEW_RBS-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; NEW_RBS-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ICMP1]], [[C2]]
+    ; NEW_RBS-NEXT: G_BRCOND [[AND]](s32), %bb.2
     ; NEW_RBS-NEXT: G_BR %bb.1
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: bb.1:
     ; NEW_RBS-NEXT: successors: %bb.2(0x80000000)
     ; NEW_RBS-NEXT: {{ $}}
-    ; NEW_RBS-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; NEW_RBS-NEXT: [[ICMP2:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
-    ; NEW_RBS-NEXT: [[TRUNC2:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP2]](s32)
-    ; NEW_RBS-NEXT: [[ANYEXT1:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC2]](s1)
+    ; NEW_RBS-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; NEW_RBS-NEXT: [[ICMP2:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ult), [[COPY2]](s32), [[C3]]
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: bb.2:
-    ; NEW_RBS-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1
-    ; NEW_RBS-NEXT: [[TRUNC3:%[0-9]+]]:sgpr(s1) = G_TRUNC [[PHI]](s32)
-    ; NEW_RBS-NEXT: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC3]](s1)
-    ; NEW_RBS-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
-    ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SEXT]], [[C3]]
+    ; NEW_RBS-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[ICMP]](s32), %bb.0, [[ICMP2]](s32), %bb.1
+    ; NEW_RBS-NEXT: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; NEW_RBS-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[PHI]], [[C4]]
+    ; NEW_RBS-NEXT: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
+    ; NEW_RBS-NEXT: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND1]](s32), [[C5]], [[C6]]
+    ; NEW_RBS-NEXT: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SELECT]], [[C7]]
     ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
     ; NEW_RBS-NEXT: G_STORE [[COPY4]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
@@ -503,10 +510,12 @@ body: |
     ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
     ; NEW_RBS-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; NEW_RBS-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY5]](s32), [[COPY6]]
-    ; NEW_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
-    ; NEW_RBS-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32)
-    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY7]], [[COPY8]]
-    ; NEW_RBS-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
+    ; NEW_RBS-NEXT: [[COPY_SCC_VCC:%[0-9]+]]:sgpr(s32) = G_COPY_SCC_VCC [[FCMP]](s1)
+    ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; NEW_RBS-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[COPY_SCC_VCC]], [[C1]]
+    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[COPY1]], [[COPY2]]
+    ; NEW_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32)
+    ; NEW_RBS-NEXT: G_STORE [[COPY7]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $sgpr1
@@ -556,9 +565,8 @@ body: |
     ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
     ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
     ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[C]]
-    ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
-    ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1)
-    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY5]](s1), [[COPY1]], [[COPY2]]
+    ; NEW_RBS-NEXT: [[COPY_VCC_SCC:%[0-9]+]]:vcc(s1) = G_COPY_VCC_SCC [[ICMP]](s32)
+    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY_VCC_SCC]](s1), [[COPY1]], [[COPY2]]
     ; NEW_RBS-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
     %0:_(s32) = COPY $sgpr0
@@ -605,9 +613,11 @@ body: |
     ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
     ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
     ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
-    ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32)
-    ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1)
-    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY5]](s1), [[COPY1]], [[COPY2]]
+    ; NEW_RBS-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; NEW_RBS-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY]], [[C]]
+    ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[AND]](s32), [[C1]]
+    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[COPY1]], [[COPY2]]
     ; NEW_RBS-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
     %0:_(s32) = COPY $vgpr0
@@ -653,10 +663,13 @@ body: |
     ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
     ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
     ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[C]]
-    ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
-    ; NEW_RBS-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1)
-    ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[ZEXT]](s32)
-    ; NEW_RBS-NEXT: G_STORE [[COPY3]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
+    ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; NEW_RBS-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ICMP]], [[C1]]
+    ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY [[C1]](s32)
+    ; NEW_RBS-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[COPY3]], [[C2]]
+    ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32)
+    ; NEW_RBS-NEXT: G_STORE [[COPY4]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
     %0:_(s32) = COPY $sgpr0
     %2:_(s32) = COPY $vgpr0
@@ -700,9 +713,12 @@ body: |
     ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
     ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
     ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[C]]
-    ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
-    ; NEW_RBS-NEXT: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC]](s1)
-    ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SEXT]](s32)
+    ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; NEW_RBS-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ICMP]], [[C1]]
+    ; NEW_RBS-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
+    ; NEW_RBS-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[C2]], [[C3]]
+    ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32)
     ; NEW_RBS-NEXT: G_STORE [[COPY3]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
     %0:_(s32) = COPY $sgpr0
@@ -816,16 +832,12 @@ body: |
     ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
     ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
     ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(uge), [[COPY]](s32), [[C]]
-    ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
     ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 20
     ; NEW_RBS-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(uge), [[COPY1]](s32), [[C1]]
-    ; NEW_RBS-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32)
-    ; NEW_RBS-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
-    ; NEW_RBS-NEXT: [[ANYEXT1:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC1]](s1)
-    ; NEW_RBS-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ANYEXT]], [[ANYEXT1]]
-    ; NEW_RBS-NEXT: [[TRUNC2:%[0-9]+]]:sgpr(s1) = G_TRUNC [[AND]](s32)
-    ; NEW_RBS-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC2]](s1)
-    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ZEXT]](s32), [[COPY]], [[COPY1]]
+    ; NEW_RBS-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ICMP]], [[ICMP1]]
+    ; NEW_RBS-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; NEW_RBS-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[AND]], [[C2]]
+    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND1]](s32), [[COPY]], [[COPY1]]
     ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32)
     ; NEW_RBS-NEXT: G_STORE [[COPY4]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
@@ -887,8 +899,9 @@ body: |
     ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
     ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
     ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
-    ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]]
-    ; NEW_RBS-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]]
+    ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP]](s1)
+    ; NEW_RBS-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY4]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
     ; NEW_RBS-NEXT: G_BR %bb.1
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: bb.1:
@@ -897,10 +910,10 @@ body: |
     ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: bb.2:
-    ; NEW_RBS-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[C]](s32), %bb.0, [[C1]](s32), %bb.1
-    ; NEW_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-    ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[PHI]](s32)
-    ; NEW_RBS-NEXT: G_STORE [[COPY4]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
+    ; NEW_RBS-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[C]](s32), %bb.0, [[C1]](s32), %bb.1
+    ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY [[SI_IF]](s32)
+    ; NEW_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[COPY5]](s32)
+    ; NEW_RBS-NEXT: G_STORE [[PHI]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
   bb.1:
     successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -983,24 +996,26 @@ body: |
     ; NEW_RBS-NEXT: bb.1:
     ; NEW_RBS-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
     ; NEW_RBS-NEXT: {{ $}}
-    ; NEW_RBS-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
-    ; NEW_RBS-NEXT: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.1
+    ; NEW_RBS-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI %17(s32), %bb.1, [[C1]](s32), %bb.0
+    ; NEW_RBS-NEXT: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.1
     ; NEW_RBS-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
-    ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[PHI1]], [[COPY3]]
-    ; NEW_RBS-NEXT: [[UITOFP:%[0-9]+]]:vgpr(s32) = G_UITOFP [[ADD]](s32)
+    ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[PHI1]], [[C2]]
+    ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
+    ; NEW_RBS-NEXT: [[UITOFP:%[0-9]+]]:vgpr(s32) = G_UITOFP [[COPY3]](s32)
     ; NEW_RBS-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
-    ; NEW_RBS-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
-    ; NEW_RBS-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; NEW_RBS-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
+    ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec(s32) = COPY [[INT]](s32)
+    ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY [[COPY4]](s32)
+    ; NEW_RBS-NEXT: SI_LOOP [[COPY4]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     ; NEW_RBS-NEXT: G_BR %bb.2
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: bb.2:
     ; NEW_RBS-NEXT: [[PHI2:%[0-9]+]]:vgpr(s32) = G_PHI [[ADD]](s32), %bb.1
-    ; NEW_RBS-NEXT: [[PHI3:%[0-9]+]]:sgpr(s32) = G_PHI [[INT]](s32), %bb.1
+    ; NEW_RBS-NEXT: [[PHI3:%[0-9]+]]:sgpr(s32) = G_PHI [[COPY5]](s32), %bb.1
     ; NEW_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s32)
     ; NEW_RBS-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
-    ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32)
-    ; NEW_RBS-NEXT: [[MUL:%[0-9]+]]:vgpr(s32) = G_MUL [[PHI2]], [[COPY4]]
+    ; NEW_RBS-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32)
+    ; NEW_RBS-NEXT: [[MUL:%[0-9]+]]:vgpr(s32) = G_MUL [[PHI2]], [[COPY6]]
     ; NEW_RBS-NEXT: G_STORE [[MUL]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
   bb.1:
@@ -1180,29 +1195,30 @@ body: |
     ; NEW_RBS-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %13(s1), %bb.3
-    ; NEW_RBS-NEXT: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI %15(s32), %bb.3, [[C]](s32), %bb.0
+    ; NEW_RBS-NEXT: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI %67(s32), %bb.3, [[C]](s32), %bb.0
     ; NEW_RBS-NEXT: [[PHI2:%[0-9]+]]:vgpr(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.3
     ; NEW_RBS-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
-    ; NEW_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[PHI2]](s32)
     ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31
-    ; NEW_RBS-NEXT: [[ASHR:%[0-9]+]]:vgpr(s32) = G_ASHR [[COPY7]], [[C1]](s32)
-    ; NEW_RBS-NEXT: [[MV3:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY7]](s32), [[ASHR]](s32)
+    ; NEW_RBS-NEXT: [[ASHR:%[0-9]+]]:vgpr(s32) = G_ASHR [[PHI2]], [[C1]](s32)
+    ; NEW_RBS-NEXT: [[MV3:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[PHI2]](s32), [[ASHR]](s32)
     ; NEW_RBS-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
-    ; NEW_RBS-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
-    ; NEW_RBS-NEXT: [[SHL:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY8]](s32)
+    ; NEW_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
+    ; NEW_RBS-NEXT: [[SHL:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY7]](s32)
     ; NEW_RBS-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
     ; NEW_RBS-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
     ; NEW_RBS-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
-    ; NEW_RBS-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32)
-    ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[COPY9]]
+    ; NEW_RBS-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32)
+    ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[COPY8]]
+    ; NEW_RBS-NEXT: [[COPY9:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP]](s1)
     ; NEW_RBS-NEXT: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[C4]](s32)
-    ; NEW_RBS-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[TRUNC]](s1)
+    ; NEW_RBS-NEXT: [[COPY10:%[0-9]+]]:sgpr(s32) = COPY [[C4]](s32)
+    ; NEW_RBS-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[C4]], [[COPY10]]
+    ; NEW_RBS-NEXT: [[COPY_VCC_SCC:%[0-9]+]]:sreg_32(s1) = G_COPY_VCC_SCC [[AND]](s32)
     ; NEW_RBS-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
-    ; NEW_RBS-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
+    ; NEW_RBS-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY_VCC_SCC]](s1), implicit-def $scc
     ; NEW_RBS-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
     ; NEW_RBS-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
-    ; NEW_RBS-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; NEW_RBS-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY9]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
     ; NEW_RBS-NEXT: G_BR %bb.2
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: bb.2:
@@ -1215,12 +1231,14 @@ body: |
     ; NEW_RBS-NEXT: [[LOAD1:%[0-9]+]]:vgpr(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1)
     ; NEW_RBS-NEXT: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
     ; NEW_RBS-NEXT: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[C6]](s32)
-    ; NEW_RBS-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[COPY13]]
+    ; NEW_RBS-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[COPY13]]
+    ; NEW_RBS-NEXT: [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP1]](s1)
     ; NEW_RBS-NEXT: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; NEW_RBS-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[C7]](s32)
-    ; NEW_RBS-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[TRUNC1]](s1)
-    ; NEW_RBS-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[COPY14]](s1)
-    ; NEW_RBS-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; NEW_RBS-NEXT: [[COPY15:%[0-9]+]]:sgpr(s32) = COPY [[C7]](s32)
+    ; NEW_RBS-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[C7]], [[COPY15]]
+    ; NEW_RBS-NEXT: [[COPY_VCC_SCC1:%[0-9]+]]:sreg_32(s1) = G_COPY_VCC_SCC [[AND1]](s32)
+    ; NEW_RBS-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[COPY_VCC_SCC1]](s1)
+    ; NEW_RBS-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY14]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
     ; NEW_RBS-NEXT: G_BR %bb.4
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: bb.3:
@@ -1228,49 +1246,54 @@ body: |
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %43(s1), %bb.5
     ; NEW_RBS-NEXT: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI %44(s32), %bb.5, [[DEF]](s32), %bb.1
-    ; NEW_RBS-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
-    ; NEW_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-    ; NEW_RBS-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY16]](s1), [[PHI1]](s32)
-    ; NEW_RBS-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; NEW_RBS-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
+    ; NEW_RBS-NEXT: [[COPY18:%[0-9]+]]:sgpr(s32) = COPY [[SI_IF]](s32)
+    ; NEW_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[COPY18]](s32)
+    ; NEW_RBS-NEXT: [[COPY19:%[0-9]+]]:vcc(s1) = COPY [[COPY17]](s1)
+    ; NEW_RBS-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY19]](s1), [[PHI1]](s32)
+    ; NEW_RBS-NEXT: [[COPY20:%[0-9]+]]:sreg_32_xm0_xexec(s32) = COPY [[INT]](s32)
+    ; NEW_RBS-NEXT: [[COPY21:%[0-9]+]]:sgpr(s32) = COPY [[COPY20]](s32)
+    ; NEW_RBS-NEXT: SI_LOOP [[COPY20]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     ; NEW_RBS-NEXT: G_BR %bb.6
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: bb.4:
     ; NEW_RBS-NEXT: successors: %bb.5(0x80000000)
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: [[C8:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
-    ; NEW_RBS-NEXT: [[COPY17:%[0-9]+]]:vgpr(s32) = COPY [[C8]](s32)
-    ; NEW_RBS-NEXT: [[SHL2:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY17]](s32)
+    ; NEW_RBS-NEXT: [[COPY22:%[0-9]+]]:vgpr(s32) = COPY [[C8]](s32)
+    ; NEW_RBS-NEXT: [[SHL2:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY22]](s32)
     ; NEW_RBS-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[MV]], [[SHL2]](s64)
     ; NEW_RBS-NEXT: [[LOAD2:%[0-9]+]]:vgpr(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s32), addrspace 1)
     ; NEW_RBS-NEXT: [[C9:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; NEW_RBS-NEXT: [[COPY18:%[0-9]+]]:vgpr(s32) = COPY [[C9]](s32)
-    ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[LOAD2]], [[COPY18]]
+    ; NEW_RBS-NEXT: [[COPY23:%[0-9]+]]:vgpr(s32) = COPY [[C9]](s32)
+    ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[LOAD2]], [[COPY23]]
     ; NEW_RBS-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD2]](p1) :: (store (s32), addrspace 1)
-    ; NEW_RBS-NEXT: [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[C9]](s32)
-    ; NEW_RBS-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[PHI2]], [[COPY19]]
+    ; NEW_RBS-NEXT: [[COPY24:%[0-9]+]]:vgpr(s32) = COPY [[C9]](s32)
+    ; NEW_RBS-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[PHI2]], [[COPY24]]
     ; NEW_RBS-NEXT: [[C10:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 100
-    ; NEW_RBS-NEXT: [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[C10]](s32)
-    ; NEW_RBS-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[COPY20]]
-    ; NEW_RBS-NEXT: [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
-    ; NEW_RBS-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY15]](s1), $exec_lo, implicit-def $scc
-    ; NEW_RBS-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY21]](s1), implicit-def $scc
+    ; NEW_RBS-NEXT: [[COPY25:%[0-9]+]]:vgpr(s32) = COPY [[C10]](s32)
+    ; NEW_RBS-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[COPY25]]
+    ; NEW_RBS-NEXT: [[COPY26:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+    ; NEW_RBS-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY16]](s1), $exec_lo, implicit-def $scc
+    ; NEW_RBS-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY26]](s1), implicit-def $scc
     ; NEW_RBS-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: bb.5:
     ; NEW_RBS-NEXT: successors: %bb.3(0x80000000)
     ; NEW_RBS-NEXT: {{ $}}
-    ; NEW_RBS-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[COPY14]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4
+    ; NEW_RBS-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[COPY_VCC_SCC1]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4
     ; NEW_RBS-NEXT: [[PHI6:%[0-9]+]]:vgpr(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2
-    ; NEW_RBS-NEXT: [[COPY22:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
-    ; NEW_RBS-NEXT: [[COPY23:%[0-9]+]]:sreg_32(s1) = COPY [[COPY22]](s1)
-    ; NEW_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
+    ; NEW_RBS-NEXT: [[COPY27:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
+    ; NEW_RBS-NEXT: [[COPY28:%[0-9]+]]:sreg_32(s1) = COPY [[COPY27]](s1)
+    ; NEW_RBS-NEXT: [[COPY29:%[0-9]+]]:sgpr(s32) = COPY [[SI_IF1]](s32)
+    ; NEW_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[COPY29]](s32)
     ; NEW_RBS-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc
-    ; NEW_RBS-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY23]](s1), implicit-def $scc
+    ; NEW_RBS-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY28]](s1), implicit-def $scc
     ; NEW_RBS-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
     ; NEW_RBS-NEXT: G_BR %bb.3
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: bb.6:
-    ; NEW_RBS-NEXT: [[PHI7:%[0-9]+]]:sgpr(s32) = G_PHI [[INT]](s32), %bb.3
+    ; NEW_RBS-NEXT: [[PHI7:%[0-9]+]]:sgpr(s32) = G_PHI [[COPY21]](s32), %bb.3
     ; NEW_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
     ; NEW_RBS-NEXT: S_ENDPGM 0
   bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zextload.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zextload.mir
index c2f219dd10cab..1b4796d5eabc8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zextload.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zextload.mir
@@ -1,6 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=amdgpu-regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=amdgpu-regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="rb-select,rb-legalize" -verify-machineinstrs -o - %s | FileCheck %s
 
 ---
 name: zextload_constant_i8_to_i32_uniform
@@ -15,6 +14,7 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
     ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p4) :: (load (s8), addrspace 4)
+    ; CHECK-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[ZEXTLOAD]]
     %0:_(p4) = COPY $sgpr0_sgpr1
     %1:_(s32) = G_ZEXTLOAD %0 :: (load (s8), addrspace 4, align 1)
 ...
@@ -33,6 +33,7 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
     ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p4) :: (load (s8), addrspace 1)
+    ; CHECK-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[ZEXTLOAD]]
     %0:_(p4) = COPY $sgpr0_sgpr1
     %1:_(s32) = G_ZEXTLOAD %0 :: (load (s8), addrspace 1, align 1)
 ...
@@ -51,6 +52,7 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
     ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p4) :: (load (s16), addrspace 4)
+    ; CHECK-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[ZEXTLOAD]]
     %0:_(p4) = COPY $sgpr0_sgpr1
     %1:_(s32) = G_ZEXTLOAD %0 :: (load (s16), addrspace 4, align 2)
 ...
@@ -69,6 +71,7 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
     ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p4) :: (load (s16), addrspace 1)
+    ; CHECK-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[ZEXTLOAD]]
     %0:_(p4) = COPY $sgpr0_sgpr1
     %1:_(s32) = G_ZEXTLOAD %0 :: (load (s16), addrspace 1, align 2)
 ...
@@ -86,6 +89,7 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY [[COPY]](p3)
     ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p3) :: (load (s8), addrspace 3)
+    ; CHECK-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[ZEXTLOAD]]
     %0:_(p3) = COPY $sgpr0
     %1:_(s32) = G_ZEXTLOAD %0 :: (load (s8), addrspace 3, align 1)
 ...
@@ -104,6 +108,7 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY [[COPY]](p3)
     ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p3) :: (load (s16), addrspace 3)
+    ; CHECK-NEXT: [[READANYLANE:%[0-9]+]]:sgpr(s32) = G_READANYLANE [[ZEXTLOAD]]
     %0:_(p3) = COPY $sgpr0
     %1:_(s32) = G_ZEXTLOAD %0 :: (load (s16), addrspace 3, align 2)
 ...