diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 9a7dd3c31e498..f43831016952a 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -73,6 +73,7 @@ enum InstCounterType { SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only. BVH_CNT, // gfx12+ only. KM_CNT, // gfx12+ only. + X_CNT, // gfx1250. NUM_EXTENDED_INST_CNTS, NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS }; @@ -102,6 +103,7 @@ struct HardwareLimits { unsigned SamplecntMax; // gfx12+ only. unsigned BvhcntMax; // gfx12+ only. unsigned KmcntMax; // gfx12+ only. + unsigned XcntMax; // gfx1250. }; #define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \ @@ -111,10 +113,12 @@ struct HardwareLimits { DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \ DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \ DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \ + DECL(VMEM_GROUP) /* vmem group */ \ DECL(LDS_ACCESS) /* lds read & write */ \ DECL(GDS_ACCESS) /* gds read & write */ \ DECL(SQ_MESSAGE) /* send message */ \ DECL(SMEM_ACCESS) /* scalar-memory read & write */ \ + DECL(SMEM_GROUP) /* scalar-memory group */ \ DECL(EXP_GPR_LOCK) /* export holding on its data src */ \ DECL(GDS_GPR_LOCK) /* GDS holding on its data and addr src */ \ DECL(EXP_POS_ACCESS) /* write to export position */ \ @@ -178,7 +182,7 @@ enum VmemType { static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = { AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT, AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT, - AMDGPU::S_WAIT_KMCNT}; + AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT}; static bool updateVMCntOnly(const MachineInstr &Inst) { return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) || @@ -223,6 +227,8 @@ unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) { return Wait.BvhCnt; case KM_CNT: return Wait.KmCnt; + case X_CNT: + return 
Wait.XCnt; default: llvm_unreachable("bad InstCounterType"); } @@ -283,12 +289,27 @@ class WaitcntBrackets { return Limits.BvhcntMax; case KM_CNT: return Limits.KmcntMax; + case X_CNT: + return Limits.XcntMax; default: break; } return 0; } + bool isSmemCounter(InstCounterType T) const { + return T == SmemAccessCounter || T == X_CNT; + } + + unsigned getSgprScoresIdx(InstCounterType T) const { + if (T == SmemAccessCounter) + return 0; + if (T == X_CNT) + return 1; + + llvm_unreachable("Invalid SMEM counter"); + } + unsigned getScoreLB(InstCounterType T) const { assert(T < NUM_INST_CNTS); return ScoreLBs[T]; @@ -307,8 +328,8 @@ class WaitcntBrackets { if (GprNo < NUM_ALL_VGPRS) { return VgprScores[T][GprNo]; } - assert(T == SmemAccessCounter); - return SgprScores[GprNo - NUM_ALL_VGPRS]; + assert(isSmemCounter(T)); + return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; } bool merge(const WaitcntBrackets &Other); @@ -331,6 +352,7 @@ class WaitcntBrackets { void applyWaitcnt(const AMDGPU::Waitcnt &Wait); void applyWaitcnt(InstCounterType T, unsigned Count); + void applyXcnt(const AMDGPU::Waitcnt &Wait); void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, WaitEventType E, MachineInstr &MI); @@ -462,9 +484,11 @@ class WaitcntBrackets { int VgprUB = -1; int SgprUB = -1; unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; - // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt - // pre-gfx12) or KM_CNT (gfx12+ only) are relevant. - unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0}; + // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt + // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. + // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the + // X_CNT score. + unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}}; // Bitmask of the VmemTypes of VMEM instructions that might have a pending // write to each vgpr. 
unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; @@ -572,6 +596,7 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}), 0, 0, + 0, 0}; return WaitEventMaskForInstPreGFX12; @@ -607,7 +632,8 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}), eventMask({VMEM_SAMPLER_READ_ACCESS}), eventMask({VMEM_BVH_READ_ACCESS}), - eventMask({SMEM_ACCESS, SQ_MESSAGE})}; + eventMask({SMEM_ACCESS, SQ_MESSAGE}), + eventMask({VMEM_GROUP, SMEM_GROUP})}; return WaitEventMaskForInstGFX12Plus; } @@ -743,9 +769,12 @@ class SIInsertWaitcnts { return VmemReadMapping[getVmemType(Inst)]; } + bool hasXcnt() const { return ST->hasWaitXCnt(); } + bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; + bool isVmemAccess(const MachineInstr &MI) const; bool generateWaitcntInstBefore(MachineInstr &MI, WaitcntBrackets &ScoreBrackets, MachineInstr *OldWaitcntInstr, @@ -837,9 +866,9 @@ void WaitcntBrackets::setScoreByInterval(RegInterval Interval, VgprUB = std::max(VgprUB, RegNo); VgprScores[CntTy][RegNo] = Score; } else { - assert(CntTy == SmemAccessCounter); + assert(isSmemCounter(CntTy)); SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS); - SgprScores[RegNo - NUM_ALL_VGPRS] = Score; + SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score; } } } @@ -976,6 +1005,13 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore); } } + } else if (T == X_CNT) { + for (const MachineOperand &Op : Inst.all_uses()) { + RegInterval Interval = getRegInterval(&Inst, MRI, TRI, Op); + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + setRegScore(RegNo, T, CurrScore); + } + } } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ { // Match the score to the 
destination registers. // @@ -1080,6 +1116,9 @@ void WaitcntBrackets::print(raw_ostream &OS) const { case KM_CNT: OS << " KM_CNT(" << SR << "): "; break; + case X_CNT: + OS << " X_CNT(" << SR << "): "; + break; default: OS << " UNKNOWN(" << SR << "): "; break; @@ -1100,8 +1139,8 @@ void WaitcntBrackets::print(raw_ostream &OS) const { OS << RelScore << ":ds "; } } - // Also need to print sgpr scores for lgkm_cnt. - if (T == SmemAccessCounter) { + // Also need to print sgpr scores for lgkm_cnt or xcnt. + if (isSmemCounter(T)) { for (int J = 0; J <= SgprUB; J++) { unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T); if (RegScore <= LB) @@ -1140,6 +1179,7 @@ void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); simplifyWaitcnt(BVH_CNT, Wait.BvhCnt); simplifyWaitcnt(KM_CNT, Wait.KmCnt); + simplifyWaitcnt(X_CNT, Wait.XCnt); } void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, @@ -1191,6 +1231,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); applyWaitcnt(BVH_CNT, Wait.BvhCnt); applyWaitcnt(KM_CNT, Wait.KmCnt); + applyXcnt(Wait); } void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { @@ -1207,11 +1248,29 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { } } +void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) { + // Wait on XCNT is redundant if we are already waiting for a load to complete. + // SMEM can return out of order, so only omit XCNT wait if we are waiting till + // zero. + if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) + return applyWaitcnt(X_CNT, 0); + + // If we have a pending store we cannot optimize XCnt because we do not wait for + // stores. VMEM loads return in order, so if we only have loads XCnt is + // decremented to the same number as LOADCnt. 
+ if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) && + !hasPendingEvent(STORE_CNT)) + return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt)); + + applyWaitcnt(X_CNT, Wait.XCnt); +} + // Where there are multiple types of event in the bracket of a counter, // the decrement may go out of order. bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { // Scalar memory read always can go out of order. - if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) + if ((T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) || + (T == X_CNT && hasPendingEvent(SMEM_GROUP))) return true; return hasMixedPendingEvents(T); } @@ -1263,6 +1322,8 @@ static std::optional counterTypeForInstr(unsigned Opcode) { return DS_CNT; case AMDGPU::S_WAIT_KMCNT: return KM_CNT; + case AMDGPU::S_WAIT_XCNT: + return X_CNT; default: return {}; } @@ -1427,7 +1488,8 @@ WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const { AMDGPU::Waitcnt WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const { - return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0); + return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 
0 : ~0u, 0, 0, 0, + ~0u /* XCNT */); } /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and @@ -1909,6 +1971,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, ScoreBrackets.determineWait(BVH_CNT, Interval, Wait); ScoreBrackets.clearVgprVmemTypes(Interval); } + if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) { ScoreBrackets.determineWait(EXP_CNT, Interval, Wait); } @@ -1916,6 +1979,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, } else { ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait); } + + if (hasXcnt() && Op.isDef()) + ScoreBrackets.determineWait(X_CNT, Interval, Wait); } } } @@ -1958,6 +2024,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, Wait.BvhCnt = 0; if (ForceEmitWaitcnt[KM_CNT]) Wait.KmCnt = 0; + if (ForceEmitWaitcnt[X_CNT]) + Wait.XCnt = 0; if (FlushVmCnt) { if (ScoreBrackets.hasPendingEvent(LOAD_CNT)) @@ -2007,6 +2075,21 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, << "Update Instr: " << *It); } + // XCnt may already be consumed by a load wait. + if (Wait.KmCnt == 0 && Wait.XCnt != ~0u && + !ScoreBrackets.hasPendingEvent(SMEM_GROUP)) + Wait.XCnt = ~0u; + + if (Wait.LoadCnt == 0 && Wait.XCnt != ~0u && + !ScoreBrackets.hasPendingEvent(VMEM_GROUP)) + Wait.XCnt = ~0u; + + // Since the translation for VMEM addresses occurs in-order, we can skip the + // XCnt if the current instruction is of VMEM type and has a memory dependency + // with another VMEM instruction in flight. 
+ if (Wait.XCnt != ~0u && isVmemAccess(*It)) + Wait.XCnt = ~0u; + if (WCG->createNewWaitcnt(Block, It, Wait)) Modified = true; @@ -2096,6 +2179,11 @@ bool SIInsertWaitcnts::mayAccessScratchThroughFlat( }); } +bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const { + return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) || + (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode())); +} + static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst) { auto Opc = Inst.getOpcode(); return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB || @@ -2167,6 +2255,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, // bracket and the destination operand scores. // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere. + bool IsVMEMAccess = false; + bool IsSMEMAccess = false; if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) { if (TII->isAlwaysGDS(Inst.getOpcode()) || TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) { @@ -2189,6 +2279,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, if (mayAccessVMEMThroughFlat(Inst)) { ++FlatASCount; + IsVMEMAccess = true; ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), Inst); } @@ -2208,6 +2299,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->setPendingFlat(); } else if (SIInstrInfo::isVMEM(Inst) && !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) { + IsVMEMAccess = true; ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), Inst); @@ -2216,6 +2308,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst); } } else if (TII->isSMRD(Inst)) { + IsSMEMAccess = true; ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); } else if (Inst.isCall()) { if (callWaitsOnFunctionReturn(Inst)) { @@ -2258,6 +2351,15 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, break; } } + + if 
(!hasXcnt()) + return; + + if (IsVMEMAccess) + ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_GROUP, Inst); + + if (IsSMEMAccess) + ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_GROUP, Inst); } bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score, @@ -2311,9 +2413,11 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { for (int J = 0; J <= VgprUB; J++) StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]); - if (T == SmemAccessCounter) { + if (isSmemCounter(T)) { + unsigned Idx = getSgprScoresIdx(T); for (int J = 0; J <= SgprUB; J++) - StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]); + StrictDom |= + mergeScore(M, SgprScores[Idx][J], Other.SgprScores[Idx][J]); } } @@ -2651,6 +2755,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV); Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV); Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV); + Limits.XcntMax = AMDGPU::getXcntBitMask(IV); [[maybe_unused]] unsigned NumVGPRsMax = ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize()); @@ -2679,7 +2784,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT)) .addImm(0); for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { - if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT) + if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT) continue; if (!ST->hasImageInsts() && diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 0e5493259edb9..13549e5c4e58b 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -129,6 +129,11 @@ unsigned getKmcntBitWidth(unsigned VersionMajor) { return VersionMajor >= 12 ? 5 : 0; } +/// \returns Xcnt bit width. 
+unsigned getXcntBitWidth(unsigned VersionMajor, unsigned VersionMinor) { + return VersionMajor == 12 && VersionMinor == 5 ? 6 : 0; +} + /// \returns shift for Loadcnt/Storecnt in combined S_WAIT instructions. unsigned getLoadcntStorecntBitShift(unsigned VersionMajor) { return VersionMajor >= 12 ? 8 : 0; @@ -1493,6 +1498,10 @@ unsigned getKmcntBitMask(const IsaVersion &Version) { return (1 << getKmcntBitWidth(Version.Major)) - 1; } +unsigned getXcntBitMask(const IsaVersion &Version) { + return (1 << getXcntBitWidth(Version.Major, Version.Minor)) - 1; +} + unsigned getStorecntBitMask(const IsaVersion &Version) { return (1 << getStorecntBitWidth(Version.Major)) - 1; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index ac7c5100be3d4..e6078d6918ac2 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -980,6 +980,7 @@ struct Waitcnt { unsigned SampleCnt = ~0u; // gfx12+ only. unsigned BvhCnt = ~0u; // gfx12+ only. unsigned KmCnt = ~0u; // gfx12+ only. + unsigned XCnt = ~0u; // gfx1250. Waitcnt() = default; // Pre-gfx12 constructor. @@ -988,15 +989,15 @@ struct Waitcnt { // gfx12+ constructor. 
Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt, - unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt) + unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt) : LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt), - SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt) {} + SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt), XCnt(XCnt) {} bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); } bool hasWaitExceptStoreCnt() const { return LoadCnt != ~0u || ExpCnt != ~0u || DsCnt != ~0u || - SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u; + SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u || XCnt != ~0u; } bool hasWaitStoreCnt() const { return StoreCnt != ~0u; } @@ -1008,7 +1009,7 @@ struct Waitcnt { std::min(LoadCnt, Other.LoadCnt), std::min(ExpCnt, Other.ExpCnt), std::min(DsCnt, Other.DsCnt), std::min(StoreCnt, Other.StoreCnt), std::min(SampleCnt, Other.SampleCnt), std::min(BvhCnt, Other.BvhCnt), - std::min(KmCnt, Other.KmCnt)); + std::min(KmCnt, Other.KmCnt), std::min(XCnt, Other.XCnt)); } }; @@ -1114,6 +1115,10 @@ unsigned getDscntBitMask(const IsaVersion &Version); /// Returns 0 for versions that do not support KMcnt unsigned getKmcntBitMask(const IsaVersion &Version); +/// \returns Xcnt bit mask for given isa \p Version. +/// Returns 0 for versions that do not support Xcnt. +unsigned getXcntBitMask(const IsaVersion &Version); + /// \return STOREcnt or VScnt bit mask for given isa \p Version. /// returns 0 for versions that do not support STOREcnt or VScnt. 
/// STOREcnt and VScnt are the same counter, the name used diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll new file mode 100644 index 0000000000000..70ea0688c8a49 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll @@ -0,0 +1,569 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL %s + +; Test S_WAIT_XCNT insertion for global_load/store instructions. +; Introduced additional operations in between the clauses to have the register dependency +; between the operands of VMEM operations and the def ops of VALU instructions that followed. + +define void @test_i8load_v4i8store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %ptr_b, ptr addrspace(1) %ptr_c, ptr addrspace(1) %ptr_d, ptr addrspace(1) %out) { +; GCN-SDAG-LABEL: test_i8load_v4i8store: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 +; GCN-SDAG-NEXT: global_load_u8 v2, v[2:3], off +; GCN-SDAG-NEXT: global_load_u8 v3, v[4:5], off +; GCN-SDAG-NEXT: global_load_u8 v0, v[0:1], off +; GCN-SDAG-NEXT: s_wait_loadcnt 0x2 +; GCN-SDAG-NEXT: s_wait_xcnt 0x0 +; GCN-SDAG-NEXT: v_lshlrev_b16 v1, 8, v2 +; GCN-SDAG-NEXT: s_wait_loadcnt 0x1 +; GCN-SDAG-NEXT: v_lshlrev_b16 v2, 8, v3 +; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 +; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GCN-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-SDAG-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GCN-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-SDAG-NEXT: v_or_b32_e32 v0, 
v0, v1 +; GCN-SDAG-NEXT: global_store_b32 v[8:9], v0, off +; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GCN-GISEL-LABEL: test_i8load_v4i8store: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 +; GCN-GISEL-NEXT: global_load_u8 v0, v[0:1], off +; GCN-GISEL-NEXT: global_load_u8 v1, v[2:3], off +; GCN-GISEL-NEXT: global_load_u8 v2, v[4:5], off +; GCN-GISEL-NEXT: s_wait_loadcnt 0x1 +; GCN-GISEL-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 +; GCN-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GCN-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-GISEL-NEXT: v_or3_b32 v0, v0, v1, v2 +; GCN-GISEL-NEXT: global_store_b32 v[8:9], v0, off +; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31] + %a = load i8, ptr addrspace(1) %ptr_a + %b = load i8, ptr addrspace(1) %ptr_b + %c = load i8, ptr addrspace(1) %ptr_c + %d = load i8, ptr addrspace(1) %ptr_d + %ins_0 = insertelement <4 x i8> poison, i8 %a, i32 0 + %ins_1 = insertelement <4 x i8> %ins_0, i8 %b, i32 1 + %ins_2 = insertelement <4 x i8> %ins_1, i8 %c, i32 2 + %ins_3 = insertelement <4 x i8> %ins_2, i8 %c, i32 3 + store <4 x i8> %ins_3, ptr addrspace(1) %out + ret void +} + +define i16 @test_v7i16_load_store(ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2) { +; GCN-SDAG-LABEL: test_v7i16_load_store: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 +; GCN-SDAG-NEXT: global_load_b128 v[4:7], v[0:1], off +; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[2:3], off +; GCN-SDAG-NEXT: v_mov_b32_e32 v8, 0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v9, 0 +; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 +; GCN-SDAG-NEXT: v_pk_add_u16 v10, v6, v2 +; GCN-SDAG-NEXT: v_pk_add_u16 v11, v7, v3 +; GCN-SDAG-NEXT: s_wait_xcnt 0x0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 12 +; GCN-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v6, 8 +; GCN-SDAG-NEXT: v_pk_add_u16 v4, v4, v0 +; GCN-SDAG-NEXT: v_lshrrev_b32_e32 
v0, 16, v10 +; GCN-SDAG-NEXT: v_mov_b32_e32 v7, 0 +; GCN-SDAG-NEXT: v_pk_add_u16 v5, v5, v1 +; GCN-SDAG-NEXT: s_clause 0x2 +; GCN-SDAG-NEXT: global_store_b16 v[2:3], v11, off +; GCN-SDAG-NEXT: global_store_b32 v[6:7], v10, off +; GCN-SDAG-NEXT: global_store_b64 v[8:9], v[4:5], off +; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GCN-GISEL-LABEL: test_v7i16_load_store: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 +; GCN-GISEL-NEXT: global_load_b128 v[4:7], v[0:1], off +; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[2:3], off +; GCN-GISEL-NEXT: v_mov_b32_e32 v8, 0 +; GCN-GISEL-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 2 +; GCN-GISEL-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v12, 4 +; GCN-GISEL-NEXT: v_dual_mov_b32 v13, 0 :: v_dual_mov_b32 v14, 6 +; GCN-GISEL-NEXT: v_dual_mov_b32 v15, 0 :: v_dual_mov_b32 v16, 8 +; GCN-GISEL-NEXT: v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v18, 10 +; GCN-GISEL-NEXT: v_dual_mov_b32 v20, 12 :: v_dual_mov_b32 v19, 0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v21, 0 +; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 +; GCN-GISEL-NEXT: v_pk_add_u16 v2, v6, v2 +; GCN-GISEL-NEXT: v_pk_add_u16 v4, v4, v0 +; GCN-GISEL-NEXT: v_pk_add_u16 v1, v5, v1 +; GCN-GISEL-NEXT: v_pk_add_u16 v3, v7, v3 +; GCN-GISEL-NEXT: s_clause 0x6 +; GCN-GISEL-NEXT: global_store_b16 v[8:9], v4, off +; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[10:11], v4, off +; GCN-GISEL-NEXT: global_store_b16 v[12:13], v1, off +; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[14:15], v1, off +; GCN-GISEL-NEXT: global_store_b16 v[16:17], v2, off +; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[18:19], v2, off +; GCN-GISEL-NEXT: global_store_b16 v[20:21], v3, off +; GCN-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31] + %vec1 = load <7 x i16>, ptr addrspace(1) %ptr1 + %insert = insertelement <7 x i16> %vec1, i16 20, i32 4 + %vec2 = load <7 x i16>, ptr addrspace(1) %ptr2 + %add = add <7 x i16> %vec1, %vec2 + store <7 x i16> 
%add, ptr addrspace(1) null + %elt = extractelement <7 x i16> %add, i32 5 + ret i16 %elt +} + +define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspace(1) %out) { +; GCN-SDAG-LABEL: test_v64i32_load_store: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 +; GCN-SDAG-NEXT: s_clause 0xd +; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:52 +; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:48 +; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:44 +; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 offset:40 +; GCN-SDAG-NEXT: scratch_store_b32 off, v44, s32 offset:36 +; GCN-SDAG-NEXT: scratch_store_b32 off, v45, s32 offset:32 +; GCN-SDAG-NEXT: scratch_store_b32 off, v56, s32 offset:28 +; GCN-SDAG-NEXT: scratch_store_b32 off, v57, s32 offset:24 +; GCN-SDAG-NEXT: scratch_store_b32 off, v58, s32 offset:20 +; GCN-SDAG-NEXT: scratch_store_b32 off, v59, s32 offset:16 +; GCN-SDAG-NEXT: scratch_store_b32 off, v60, s32 offset:12 +; GCN-SDAG-NEXT: scratch_store_b32 off, v61, s32 offset:8 +; GCN-SDAG-NEXT: scratch_store_b32 off, v62, s32 offset:4 +; GCN-SDAG-NEXT: scratch_store_b32 off, v63, s32 +; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:224 +; GCN-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 +; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:56 ; 16-byte Folded Spill +; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:240 +; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 +; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:72 ; 16-byte Folded Spill +; GCN-SDAG-NEXT: s_clause 0xd +; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:192 +; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:208 +; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:160 +; GCN-SDAG-NEXT: global_load_b128 v[22:25], v[0:1], off offset:176 +; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off 
offset:128 +; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off offset:144 +; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:96 +; GCN-SDAG-NEXT: global_load_b128 v[48:51], v[0:1], off offset:112 +; GCN-SDAG-NEXT: global_load_b128 v[52:55], v[0:1], off offset:64 +; GCN-SDAG-NEXT: global_load_b128 v[38:41], v[0:1], off offset:80 +; GCN-SDAG-NEXT: global_load_b128 v[42:45], v[0:1], off offset:32 +; GCN-SDAG-NEXT: global_load_b128 v[56:59], v[0:1], off offset:48 +; GCN-SDAG-NEXT: global_load_b128 v[60:63], v[0:1], off +; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16 +; GCN-SDAG-NEXT: scratch_load_b128 v[6:9], off, s32 offset:56 th:TH_LOAD_LU ; 16-byte Folded Reload +; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:224 +; GCN-SDAG-NEXT: scratch_load_b128 v[6:9], off, s32 offset:72 th:TH_LOAD_LU ; 16-byte Folded Reload +; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 +; GCN-SDAG-NEXT: s_clause 0xe +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:240 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:192 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[14:17], off offset:208 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:160 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[22:25], off offset:176 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[26:29], off offset:128 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[30:33], off offset:144 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[34:37], off offset:96 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[48:51], off offset:112 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[52:55], off offset:64 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[38:41], off offset:80 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[42:45], off offset:32 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[56:59], off offset:48 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[60:63], off +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[0:3], off offset:16 +; 
GCN-SDAG-NEXT: s_clause 0xd +; GCN-SDAG-NEXT: scratch_load_b32 v63, off, s32 +; GCN-SDAG-NEXT: scratch_load_b32 v62, off, s32 offset:4 +; GCN-SDAG-NEXT: scratch_load_b32 v61, off, s32 offset:8 +; GCN-SDAG-NEXT: scratch_load_b32 v60, off, s32 offset:12 +; GCN-SDAG-NEXT: scratch_load_b32 v59, off, s32 offset:16 +; GCN-SDAG-NEXT: scratch_load_b32 v58, off, s32 offset:20 +; GCN-SDAG-NEXT: scratch_load_b32 v57, off, s32 offset:24 +; GCN-SDAG-NEXT: scratch_load_b32 v56, off, s32 offset:28 +; GCN-SDAG-NEXT: scratch_load_b32 v45, off, s32 offset:32 +; GCN-SDAG-NEXT: scratch_load_b32 v44, off, s32 offset:36 +; GCN-SDAG-NEXT: scratch_load_b32 v43, off, s32 offset:40 +; GCN-SDAG-NEXT: scratch_load_b32 v42, off, s32 offset:44 +; GCN-SDAG-NEXT: scratch_load_b32 v41, off, s32 offset:48 +; GCN-SDAG-NEXT: scratch_load_b32 v40, off, s32 offset:52 +; GCN-SDAG-NEXT: s_wait_xcnt 0xe +; GCN-SDAG-NEXT: v_mov_b32_e32 v0, v2 +; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 +; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GCN-GISEL-LABEL: test_v64i32_load_store: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 +; GCN-GISEL-NEXT: s_clause 0xf +; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:60 +; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:56 +; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:52 +; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:48 +; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:44 +; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 offset:40 +; GCN-GISEL-NEXT: scratch_store_b32 off, v46, s32 offset:36 +; GCN-GISEL-NEXT: scratch_store_b32 off, v47, s32 offset:32 +; GCN-GISEL-NEXT: scratch_store_b32 off, v56, s32 offset:28 +; GCN-GISEL-NEXT: scratch_store_b32 off, v57, s32 offset:24 +; GCN-GISEL-NEXT: scratch_store_b32 off, v58, s32 offset:20 +; GCN-GISEL-NEXT: scratch_store_b32 off, v59, s32 offset:16 +; GCN-GISEL-NEXT: scratch_store_b32 off, v60, s32 offset:12 +; GCN-GISEL-NEXT: 
scratch_store_b32 off, v61, s32 offset:8 +; GCN-GISEL-NEXT: scratch_store_b32 off, v62, s32 offset:4 +; GCN-GISEL-NEXT: scratch_store_b32 off, v63, s32 +; GCN-GISEL-NEXT: s_wait_xcnt 0x8 +; GCN-GISEL-NEXT: v_dual_mov_b32 v46, v3 :: v_dual_mov_b32 v47, v4 +; GCN-GISEL-NEXT: global_load_b128 v[2:5], v[0:1], off offset:32 +; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 +; GCN-GISEL-NEXT: scratch_store_b128 off, v[2:5], s32 offset:80 ; 16-byte Folded Spill +; GCN-GISEL-NEXT: s_clause 0xe +; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:48 +; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off offset:64 +; GCN-GISEL-NEXT: global_load_b128 v[14:17], v[0:1], off offset:80 +; GCN-GISEL-NEXT: global_load_b128 v[18:21], v[0:1], off offset:96 +; GCN-GISEL-NEXT: global_load_b128 v[22:25], v[0:1], off offset:112 +; GCN-GISEL-NEXT: global_load_b128 v[26:29], v[0:1], off offset:128 +; GCN-GISEL-NEXT: global_load_b128 v[30:33], v[0:1], off offset:144 +; GCN-GISEL-NEXT: global_load_b128 v[34:37], v[0:1], off offset:160 +; GCN-GISEL-NEXT: global_load_b128 v[48:51], v[0:1], off offset:176 +; GCN-GISEL-NEXT: global_load_b128 v[52:55], v[0:1], off offset:192 +; GCN-GISEL-NEXT: global_load_b128 v[38:41], v[0:1], off offset:208 +; GCN-GISEL-NEXT: global_load_b128 v[42:45], v[0:1], off offset:224 +; GCN-GISEL-NEXT: global_load_b128 v[56:59], v[0:1], off +; GCN-GISEL-NEXT: global_load_b128 v[60:63], v[0:1], off offset:16 +; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[0:1], off offset:240 +; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 +; GCN-GISEL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:64 ; 16-byte Folded Spill +; GCN-GISEL-NEXT: scratch_load_b128 v[0:3], off, s32 offset:80 th:TH_LOAD_LU ; 16-byte Folded Reload +; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 +; GCN-GISEL-NEXT: s_clause 0xe +; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[0:3], off offset:32 +; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[6:9], off offset:48 +; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[10:13], off 
offset:64 +; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[14:17], off offset:80 +; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[18:21], off offset:96 +; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[22:25], off offset:112 +; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[26:29], off offset:128 +; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[30:33], off offset:144 +; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[34:37], off offset:160 +; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[48:51], off offset:176 +; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[52:55], off offset:192 +; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[38:41], off offset:208 +; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[42:45], off offset:224 +; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[56:59], off +; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[60:63], off offset:16 +; GCN-GISEL-NEXT: scratch_load_b128 v[0:3], off, s32 offset:64 th:TH_LOAD_LU ; 16-byte Folded Reload +; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 +; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[0:3], off offset:240 +; GCN-GISEL-NEXT: s_wait_xcnt 0x0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v0, v62 +; GCN-GISEL-NEXT: s_clause 0xf +; GCN-GISEL-NEXT: scratch_load_b32 v63, off, s32 +; GCN-GISEL-NEXT: scratch_load_b32 v62, off, s32 offset:4 +; GCN-GISEL-NEXT: scratch_load_b32 v61, off, s32 offset:8 +; GCN-GISEL-NEXT: scratch_load_b32 v60, off, s32 offset:12 +; GCN-GISEL-NEXT: scratch_load_b32 v59, off, s32 offset:16 +; GCN-GISEL-NEXT: scratch_load_b32 v58, off, s32 offset:20 +; GCN-GISEL-NEXT: scratch_load_b32 v57, off, s32 offset:24 +; GCN-GISEL-NEXT: scratch_load_b32 v56, off, s32 offset:28 +; GCN-GISEL-NEXT: scratch_load_b32 v47, off, s32 offset:32 +; GCN-GISEL-NEXT: scratch_load_b32 v46, off, s32 offset:36 +; GCN-GISEL-NEXT: scratch_load_b32 v45, off, s32 offset:40 +; GCN-GISEL-NEXT: scratch_load_b32 v44, off, s32 offset:44 +; GCN-GISEL-NEXT: scratch_load_b32 v43, off, s32 offset:48 +; GCN-GISEL-NEXT: scratch_load_b32 v42, off, s32 
offset:52 +; GCN-GISEL-NEXT: scratch_load_b32 v41, off, s32 offset:56 +; GCN-GISEL-NEXT: scratch_load_b32 v40, off, s32 offset:60 +; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 +; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31] + %vec = load <64 x i32>, ptr addrspace(1) %ptr + store <64 x i32> %vec, ptr addrspace(1) %out, align 4 + %elt = extractelement <64 x i32> %vec, i32 6 + ret i32 %elt +} + +define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %ptr_b, ptr addrspace(1) %out) { +; GCN-SDAG-LABEL: test_v16i64_load_store: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 +; GCN-SDAG-NEXT: s_clause 0x3 +; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:12 +; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:8 +; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4 +; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 +; GCN-SDAG-NEXT: s_clause 0x7 +; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:112 +; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:96 +; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:80 +; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:48 +; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off offset:32 +; GCN-SDAG-NEXT: global_load_b128 v[22:25], v[0:1], off offset:16 +; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off +; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[0:1], off offset:64 +; GCN-SDAG-NEXT: v_mov_b32_e32 v16, 0x70 +; GCN-SDAG-NEXT: v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v50, 0x60 +; GCN-SDAG-NEXT: v_dual_mov_b32 v51, 0 :: v_dual_mov_b32 v52, 48 +; GCN-SDAG-NEXT: v_dual_mov_b32 v38, 0x50 :: v_dual_mov_b32 v53, 0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v54, 32 +; GCN-SDAG-NEXT: v_dual_mov_b32 v14, 0xc8 :: v_dual_mov_b32 v15, 0 +; GCN-SDAG-NEXT: v_dual_mov_b32 v39, 0 :: v_dual_mov_b32 v48, 64 +; GCN-SDAG-NEXT: v_dual_mov_b32 v55, 0 :: v_dual_mov_b32 v40, 16 +; GCN-SDAG-NEXT: v_mov_b32_e32 v49, 0 +; GCN-SDAG-NEXT: 
v_dual_mov_b32 v41, 0 :: v_dual_mov_b32 v42, 0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v43, 0 +; GCN-SDAG-NEXT: s_wait_loadcnt 0x7 +; GCN-SDAG-NEXT: global_store_b128 v[16:17], v[6:9], off +; GCN-SDAG-NEXT: s_wait_loadcnt 0x6 +; GCN-SDAG-NEXT: global_store_b128 v[50:51], v[10:13], off +; GCN-SDAG-NEXT: s_wait_loadcnt 0x5 +; GCN-SDAG-NEXT: s_wait_xcnt 0x1 +; GCN-SDAG-NEXT: v_dual_mov_b32 v16, v20 :: v_dual_mov_b32 v17, v21 +; GCN-SDAG-NEXT: s_wait_xcnt 0x0 +; GCN-SDAG-NEXT: v_lshl_add_u64 v[12:13], v[12:13], 0, v[12:13] +; GCN-SDAG-NEXT: v_lshl_add_u64 v[10:11], v[10:11], 0, v[10:11] +; GCN-SDAG-NEXT: v_lshl_add_u64 v[8:9], v[8:9], 0, v[8:9] +; GCN-SDAG-NEXT: v_lshl_add_u64 v[6:7], v[6:7], 0, v[6:7] +; GCN-SDAG-NEXT: s_wait_loadcnt 0x4 +; GCN-SDAG-NEXT: global_store_b128 v[52:53], v[34:37], off +; GCN-SDAG-NEXT: s_wait_loadcnt 0x3 +; GCN-SDAG-NEXT: global_store_b128 v[54:55], v[30:33], off +; GCN-SDAG-NEXT: s_wait_loadcnt 0x2 +; GCN-SDAG-NEXT: global_store_b128 v[40:41], v[22:25], off +; GCN-SDAG-NEXT: s_wait_loadcnt 0x1 +; GCN-SDAG-NEXT: global_store_b128 v[42:43], v[26:29], off +; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 +; GCN-SDAG-NEXT: s_wait_xcnt 0x3 +; GCN-SDAG-NEXT: v_lshl_add_u64 v[52:53], v[2:3], 0, v[2:3] +; GCN-SDAG-NEXT: v_lshl_add_u64 v[50:51], v[0:1], 0, v[0:1] +; GCN-SDAG-NEXT: s_wait_xcnt 0x1 +; GCN-SDAG-NEXT: v_lshl_add_u64 v[24:25], v[24:25], 0, v[24:25] +; GCN-SDAG-NEXT: v_lshl_add_u64 v[22:23], v[22:23], 0, v[22:23] +; GCN-SDAG-NEXT: s_wait_xcnt 0x0 +; GCN-SDAG-NEXT: v_lshl_add_u64 v[28:29], v[28:29], 0, v[28:29] +; GCN-SDAG-NEXT: v_lshl_add_u64 v[26:27], v[26:27], 0, v[26:27] +; GCN-SDAG-NEXT: v_lshl_add_u64 v[36:37], v[36:37], 0, v[36:37] +; GCN-SDAG-NEXT: v_lshl_add_u64 v[34:35], v[34:35], 0, v[34:35] +; GCN-SDAG-NEXT: v_lshl_add_u64 v[32:33], v[32:33], 0, 0x64 +; GCN-SDAG-NEXT: v_lshl_add_u64 v[30:31], v[30:31], 0, v[30:31] +; GCN-SDAG-NEXT: v_lshl_add_u64 v[20:21], v[20:21], 0, v[20:21] +; GCN-SDAG-NEXT: v_lshl_add_u64 v[18:19], v[18:19], 0, 0xc8 +; 
GCN-SDAG-NEXT: s_clause 0x1 +; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[14:17], off +; GCN-SDAG-NEXT: global_store_b128 v[48:49], v[0:3], off +; GCN-SDAG-NEXT: s_clause 0x7 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:96 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:112 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[50:53], off offset:64 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:80 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[30:33], off offset:32 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[34:37], off offset:48 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[26:29], off +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[22:25], off offset:16 +; GCN-SDAG-NEXT: s_clause 0x3 +; GCN-SDAG-NEXT: scratch_load_b32 v43, off, s32 +; GCN-SDAG-NEXT: scratch_load_b32 v42, off, s32 offset:4 +; GCN-SDAG-NEXT: scratch_load_b32 v41, off, s32 offset:8 +; GCN-SDAG-NEXT: scratch_load_b32 v40, off, s32 offset:12 +; GCN-SDAG-NEXT: s_wait_xcnt 0xc +; GCN-SDAG-NEXT: v_dual_mov_b32 v0, v28 :: v_dual_mov_b32 v1, v29 +; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 +; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GCN-GISEL-LABEL: test_v16i64_load_store: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 +; GCN-GISEL-NEXT: s_clause 0x5 +; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:20 +; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:16 +; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:12 +; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:8 +; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:4 +; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 +; GCN-GISEL-NEXT: s_clause 0x7 +; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:80 +; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off +; GCN-GISEL-NEXT: global_load_b128 v[14:17], v[0:1], off offset:16 +; GCN-GISEL-NEXT: global_load_b128 v[18:21], v[0:1], off offset:32 +; GCN-GISEL-NEXT: 
global_load_b128 v[22:25], v[0:1], off offset:48 +; GCN-GISEL-NEXT: global_load_b128 v[26:29], v[0:1], off offset:96 +; GCN-GISEL-NEXT: global_load_b128 v[30:33], v[0:1], off offset:112 +; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[0:1], off offset:64 +; GCN-GISEL-NEXT: v_mov_b32_e32 v34, 0xc8 +; GCN-GISEL-NEXT: v_dual_mov_b32 v35, 0 :: v_dual_mov_b32 v38, 0 +; GCN-GISEL-NEXT: v_dual_mov_b32 v39, 0 :: v_dual_mov_b32 v48, 16 +; GCN-GISEL-NEXT: v_dual_mov_b32 v49, 0 :: v_dual_mov_b32 v50, 32 +; GCN-GISEL-NEXT: v_dual_mov_b32 v52, 48 :: v_dual_mov_b32 v51, 0 +; GCN-GISEL-NEXT: v_dual_mov_b32 v53, 0 :: v_dual_mov_b32 v54, 64 +; GCN-GISEL-NEXT: v_dual_mov_b32 v40, 0x50 :: v_dual_mov_b32 v55, 0 +; GCN-GISEL-NEXT: v_dual_mov_b32 v41, 0 :: v_dual_mov_b32 v42, 0x60 +; GCN-GISEL-NEXT: v_dual_mov_b32 v44, 0x70 :: v_dual_mov_b32 v43, 0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v45, 0 +; GCN-GISEL-NEXT: s_wait_loadcnt 0x7 +; GCN-GISEL-NEXT: v_dual_mov_b32 v37, v9 :: v_dual_mov_b32 v36, v8 +; GCN-GISEL-NEXT: v_lshl_add_u64 v[6:7], v[6:7], 0, 0xc8 +; GCN-GISEL-NEXT: v_lshl_add_u64 v[8:9], v[8:9], 0, v[8:9] +; GCN-GISEL-NEXT: s_wait_loadcnt 0x6 +; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[10:13], off +; GCN-GISEL-NEXT: s_wait_loadcnt 0x5 +; GCN-GISEL-NEXT: global_store_b128 v[48:49], v[14:17], off +; GCN-GISEL-NEXT: s_wait_loadcnt 0x4 +; GCN-GISEL-NEXT: global_store_b128 v[50:51], v[18:21], off +; GCN-GISEL-NEXT: s_wait_loadcnt 0x3 +; GCN-GISEL-NEXT: global_store_b128 v[52:53], v[22:25], off +; GCN-GISEL-NEXT: s_wait_loadcnt 0x2 +; GCN-GISEL-NEXT: global_store_b128 v[42:43], v[26:29], off +; GCN-GISEL-NEXT: s_wait_loadcnt 0x1 +; GCN-GISEL-NEXT: global_store_b128 v[44:45], v[30:33], off +; GCN-GISEL-NEXT: s_wait_xcnt 0x5 +; GCN-GISEL-NEXT: v_lshl_add_u64 v[10:11], v[10:11], 0, v[10:11] +; GCN-GISEL-NEXT: v_lshl_add_u64 v[12:13], v[12:13], 0, v[12:13] +; GCN-GISEL-NEXT: s_wait_xcnt 0x4 +; GCN-GISEL-NEXT: v_lshl_add_u64 v[14:15], v[14:15], 0, v[14:15] +; GCN-GISEL-NEXT: v_lshl_add_u64 
v[16:17], v[16:17], 0, v[16:17] +; GCN-GISEL-NEXT: s_wait_xcnt 0x3 +; GCN-GISEL-NEXT: v_lshl_add_u64 v[18:19], v[18:19], 0, v[18:19] +; GCN-GISEL-NEXT: v_lshl_add_u64 v[20:21], v[20:21], 0, 0x64 +; GCN-GISEL-NEXT: s_wait_xcnt 0x2 +; GCN-GISEL-NEXT: v_lshl_add_u64 v[22:23], v[22:23], 0, v[22:23] +; GCN-GISEL-NEXT: v_lshl_add_u64 v[24:25], v[24:25], 0, v[24:25] +; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 +; GCN-GISEL-NEXT: v_lshl_add_u64 v[48:49], v[0:1], 0, v[0:1] +; GCN-GISEL-NEXT: v_lshl_add_u64 v[50:51], v[2:3], 0, v[2:3] +; GCN-GISEL-NEXT: s_wait_xcnt 0x1 +; GCN-GISEL-NEXT: v_lshl_add_u64 v[26:27], v[26:27], 0, v[26:27] +; GCN-GISEL-NEXT: v_lshl_add_u64 v[28:29], v[28:29], 0, v[28:29] +; GCN-GISEL-NEXT: s_wait_xcnt 0x0 +; GCN-GISEL-NEXT: v_lshl_add_u64 v[30:31], v[30:31], 0, v[30:31] +; GCN-GISEL-NEXT: v_lshl_add_u64 v[32:33], v[32:33], 0, v[32:33] +; GCN-GISEL-NEXT: s_clause 0x1 +; GCN-GISEL-NEXT: global_store_b128 v[54:55], v[0:3], off +; GCN-GISEL-NEXT: global_store_b128 v[40:41], v[34:37], off +; GCN-GISEL-NEXT: s_clause 0x7 +; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off +; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[14:17], off offset:16 +; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[18:21], off offset:32 +; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[22:25], off offset:48 +; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[48:51], off offset:64 +; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off offset:80 +; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[26:29], off offset:96 +; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[30:33], off offset:112 +; GCN-GISEL-NEXT: s_clause 0x5 +; GCN-GISEL-NEXT: scratch_load_b32 v45, off, s32 +; GCN-GISEL-NEXT: scratch_load_b32 v44, off, s32 offset:4 +; GCN-GISEL-NEXT: scratch_load_b32 v43, off, s32 offset:8 +; GCN-GISEL-NEXT: scratch_load_b32 v42, off, s32 offset:12 +; GCN-GISEL-NEXT: scratch_load_b32 v41, off, s32 offset:16 +; GCN-GISEL-NEXT: scratch_load_b32 v40, off, s32 offset:20 +; GCN-GISEL-NEXT: v_dual_mov_b32 
v0, v12 :: v_dual_mov_b32 v1, v13 +; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 +; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31] + %a = load <16 x i64>, ptr addrspace(1) %ptr_a, align 4 + %in_a = insertelement <16 x i64> %a, i64 100, i32 5 + store <16 x i64> %in_a, ptr addrspace(1) null + %b = load <16 x i64>, ptr addrspace(1) %ptr_b, align 4 + %in_b = insertelement <16 x i64> %a, i64 200, i32 10 + store <16 x i64> %in_b, ptr addrspace(1) null + %add = add <16 x i64> %in_a, %in_b + store <16 x i64> %add, ptr addrspace(1) %out, align 4 + %elt = extractelement <16 x i64> %add, i32 1 + ret i64 %elt +} + +define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2, ptr addrspace(1) %out) { +; GCN-SDAG-LABEL: test_v7i16_load_store_kernel: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GCN-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v8, 12 +; GCN-SDAG-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 8 +; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GCN-SDAG-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_lshlrev_b32 v4, 4, v0 +; GCN-SDAG-NEXT: s_wait_xcnt 0x0 +; GCN-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GCN-SDAG-NEXT: v_mov_b32_e32 v12, 0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v13, 0 +; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 +; GCN-SDAG-NEXT: s_clause 0x1 +; GCN-SDAG-NEXT: global_load_b128 v[0:3], v4, s[0:1] +; GCN-SDAG-NEXT: global_load_b128 v[4:7], v4, s[2:3] +; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 +; GCN-SDAG-NEXT: v_pk_add_u16 v3, v3, v7 +; GCN-SDAG-NEXT: v_pk_add_u16 v2, v2, v6 +; GCN-SDAG-NEXT: v_pk_add_u16 v1, v1, v5 +; GCN-SDAG-NEXT: v_pk_add_u16 v0, v0, v4 +; GCN-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GCN-SDAG-NEXT: s_clause 0x2 +; GCN-SDAG-NEXT: global_store_b16 v[8:9], v3, off +; GCN-SDAG-NEXT: global_store_b32 v[10:11], v2, off +; GCN-SDAG-NEXT: global_store_b64 v[12:13], v[0:1], off +; GCN-SDAG-NEXT: global_store_d16_hi_b16 v4, v2, s[4:5] +; GCN-SDAG-NEXT: s_endpgm +; +; GCN-GISEL-LABEL: 
test_v7i16_load_store_kernel: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GCN-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v8, 0 +; GCN-GISEL-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 2 +; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GCN-GISEL-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_lshlrev_b32 v4, 4, v0 +; GCN-GISEL-NEXT: s_wait_xcnt 0x0 +; GCN-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GCN-GISEL-NEXT: v_mov_b32_e32 v12, 4 +; GCN-GISEL-NEXT: v_dual_mov_b32 v13, 0 :: v_dual_mov_b32 v14, 6 +; GCN-GISEL-NEXT: v_dual_mov_b32 v15, 0 :: v_dual_mov_b32 v16, 8 +; GCN-GISEL-NEXT: v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v18, 10 +; GCN-GISEL-NEXT: v_dual_mov_b32 v20, 12 :: v_dual_mov_b32 v19, 0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v21, 0 +; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 +; GCN-GISEL-NEXT: s_clause 0x1 +; GCN-GISEL-NEXT: global_load_b128 v[0:3], v4, s[0:1] +; GCN-GISEL-NEXT: global_load_b128 v[4:7], v4, s[2:3] +; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 +; GCN-GISEL-NEXT: v_pk_add_u16 v0, v0, v4 +; GCN-GISEL-NEXT: v_pk_add_u16 v1, v1, v5 +; GCN-GISEL-NEXT: v_pk_add_u16 v2, v2, v6 +; GCN-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GCN-GISEL-NEXT: v_pk_add_u16 v3, v3, v7 +; GCN-GISEL-NEXT: s_clause 0x6 +; GCN-GISEL-NEXT: global_store_b16 v[8:9], v0, off +; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[10:11], v0, off +; GCN-GISEL-NEXT: global_store_b16 v[12:13], v1, off +; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[14:15], v1, off +; GCN-GISEL-NEXT: global_store_b16 v[16:17], v2, off +; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[18:19], v2, off +; GCN-GISEL-NEXT: global_store_b16 v[20:21], v3, off +; GCN-GISEL-NEXT: global_store_d16_hi_b16 v4, v2, s[4:5] +; GCN-GISEL-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr inbounds <7 x i16>, ptr addrspace(1) %ptr1, i32 %tid + %gep2 = getelementptr inbounds <7 x i16>, ptr addrspace(1) %ptr2, i32 %tid + %vec1 = load <7 x i16>, ptr addrspace(1) 
%gep1 + %insert = insertelement <7 x i16> %vec1, i16 20, i32 4 + %vec2 = load <7 x i16>, ptr addrspace(1) %gep2 + %add = add <7 x i16> %vec1, %vec2 + store <7 x i16> %add, ptr addrspace(1) null + %elt = extractelement <7 x i16> %add, i32 5 + store i16 %elt, ptr addrspace(1) %out + ret void +} + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir new file mode 100644 index 0000000000000..73b994ab2ab8c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir @@ -0,0 +1,922 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: vmem_scratch_load +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + ; GCN-LABEL: name: vmem_scratch_load + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec + ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: $vgpr1 = nofpexcept V_ADD_F32_e32 killed $vgpr2, killed $vgpr2, implicit $mode, implicit $exec + $vgpr1 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec + $vgpr3 = V_MOV_B32_e32 1, implicit $exec + $vgpr1 = nofpexcept V_ADD_F32_e32 killed $vgpr2, killed $vgpr2, implicit $mode, implicit $exec +... 
+ +--- +name: vmem_buffer_load_dword_offset +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-LABEL: name: vmem_buffer_load_dword_offset + ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec + ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: $vgpr1 = nofpexcept V_ADD_F32_e32 killed $vgpr2, killed $vgpr2, implicit $mode, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec + $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec + $vgpr3 = V_MOV_B32_e32 1, implicit $exec + $vgpr1 = nofpexcept V_ADD_F32_e32 killed $vgpr2, killed $vgpr2, implicit $mode, implicit $exec +... + +--- +name: vmem_buffer_load_addr +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-LABEL: name: vmem_buffer_load_addr + ; GCN: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_ADDR64 $vgpr0_vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $vgpr3 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec + ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: $vgpr1 = nofpexcept V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec + $vgpr2 = BUFFER_LOAD_DWORD_ADDR64 $vgpr0_vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec + $vgpr3 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec + $vgpr4 = V_MOV_B32_e32 1, implicit $exec + $vgpr1 = nofpexcept V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec +... 
+ +--- +name: vmem_flat_load +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GCN-LABEL: name: vmem_flat_load + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1) + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $vgpr3 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec + ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: $vgpr0 = nofpexcept V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec + $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1) + $vgpr3 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec + $vgpr4 = V_MOV_B32_e32 1, implicit $exec + $vgpr0 = nofpexcept V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec +... + +--- +name: vmem_global_load +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GCN-LABEL: name: vmem_global_load + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $vgpr3 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec + ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: $vgpr0 = nofpexcept V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec + $vgpr2 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec:: (load (s32) from `float addrspace(1)* undef`, align 4, addrspace 1) + $vgpr3 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec + $vgpr4 = V_MOV_B32_e32 1, implicit $exec + $vgpr0 = nofpexcept V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec +... 
+ +--- +name: vmem_global_store +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: vmem_global_store + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + ; GCN-NEXT: S_WAIT_XCNT 0 + ; GCN-NEXT: $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr3, implicit $exec + GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr3, implicit $exec +... + +--- +name: vmem_buffer_store +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + stackPtrOffsetReg: $sgpr32 +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GCN-LABEL: name: vmem_buffer_store + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_WAIT_XCNT 0 + ; GCN-NEXT: $vgpr0 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec + $vgpr0 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec +... 
+ +--- +name: vmem_scratch_store +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-LABEL: name: vmem_scratch_store + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: S_WAIT_XCNT 0 + ; GCN-NEXT: $vgpr1 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec + SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec +... + +--- +name: smem_load +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $sgpr2_sgpr3 + ; GCN-LABEL: name: smem_load + ; GCN: liveins: $sgpr2_sgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM $sgpr2_sgpr3, 0, 0 :: (load (s64), addrspace 4) + ; GCN-NEXT: S_WAIT_XCNT 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 0 + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM $sgpr2_sgpr3, 0, 0 :: (load (s64), addrspace 4) + $sgpr2 = S_MOV_B32 0 +... + +--- +name: smem_store +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $sgpr0, $sgpr2, $sgpr3 + ; GCN-LABEL: name: smem_store + ; GCN: liveins: $sgpr0, $sgpr2, $sgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr0, $sgpr2_sgpr3, 0, 0 + ; GCN-NEXT: S_WAIT_XCNT 0 + ; GCN-NEXT: $sgpr3 = S_MOV_B32 0 + S_STORE_DWORD_IMM $sgpr0, $sgpr2_sgpr3, 0, 0 + $sgpr3 = S_MOV_B32 0 +... + +# 4 global_load instructions together form a load-group. 
+ +--- +name: vmem_load_group +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr10 + ; GCN-LABEL: name: vmem_load_group + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr10 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec + ; GCN-NEXT: $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec + ; GCN-NEXT: $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 2 + ; GCN-NEXT: $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec + $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec + $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec + $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec + $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec + $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec +... + +# The contiguous stores form a single group. 
+ +--- +name: vmem_store_group +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 + ; GCN-LABEL: name: vmem_store_group + ; GCN: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr6, implicit $exec + $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec + GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec + $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr6, implicit $exec +... 
+ +--- +name: smem_load_group +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GCN-LABEL: name: smem_load_group + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4) + ; GCN-NEXT: $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4) + ; GCN-NEXT: $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4) + ; GCN-NEXT: $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4) + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 0 + $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4) + $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4) + $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4) + $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4) + $sgpr2 = S_MOV_B32 0 +... + +--- +name: smem_store_group +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; GCN-LABEL: name: smem_store_group + ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr2, $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr3, $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr4, $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_WAIT_XCNT 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr3 = S_MOV_B32 0 + S_STORE_DWORD_IMM $sgpr2, $sgpr0_sgpr1, 0, 0 + S_STORE_DWORD_IMM $sgpr3, $sgpr0_sgpr1, 0, 0 + S_STORE_DWORD_IMM $sgpr4, $sgpr0_sgpr1, 0, 0 + S_STORE_DWORD_IMM $sgpr5, $sgpr0_sgpr1, 0, 0 + $sgpr2 = S_MOV_B32 0 + $sgpr3 = S_MOV_B32 0 +... 
+ +# The four global_load instructions form two separate groups due to the intervening s_nop. + +--- +name: vmem_loads_with_an_intervening_nop +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr10 + ; GCN-LABEL: name: vmem_loads_with_an_intervening_nop + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr10 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec + ; GCN-NEXT: $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 2 + ; GCN-NEXT: $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec + $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec + $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec + S_NOP 0 + $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec + $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec + $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec +...
+ +--- +name: vmem_contiguous_loads_with_an_intervening_store +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr10 + ; GCN-LABEL: name: vmem_contiguous_loads_with_an_intervening_store + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr10 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 1 + ; GCN-NEXT: GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr2_vgpr3, 32, 0, implicit $exec + ; GCN-NEXT: $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec + ; GCN-NEXT: $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 2 + ; GCN-NEXT: $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec + $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec + $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec + GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr2_vgpr3, 32, 0, implicit $exec + $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec + $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec + $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec +... 
+ +--- +name: vmem_stores_with_intervening_nop +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 + ; GCN-LABEL: name: vmem_stores_with_intervening_nop + ; GCN: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr6, implicit $exec + $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec + S_NOP 0 + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec + GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec + $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr6, implicit $exec +... 
+ +# The intervening load breaks the store group and forms two distinct store groups. + +--- +name: vmem_contiguous_stores_with_an_intervening_load +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 + ; GCN-LABEL: name: vmem_contiguous_stores_with_an_intervening_load + ; GCN: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr11 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr6, implicit $exec + $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec + $vgpr11 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, 
implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec + GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec + $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr6, implicit $exec +... + +# Atomic operations should not form a group. But they are memory instructions and should increment +# the xcnt counter value as they might cause register dependency. This test ensures S_WAIT_XCNT +# insertion for such cases. + +--- +name: atomic_op +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-LABEL: name: atomic_op + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr2 = BUFFER_ATOMIC_ADD_ADDR64_RTN $vgpr2, $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 1, implicit $exec :: (load store (s32), addrspace 1) + ; GCN-NEXT: GLOBAL_ATOMIC_ADD_F32 $vgpr4_vgpr5, killed renamable $vgpr3, 0, 0, implicit $exec :: (load store syncscope("agent-one-as") monotonic monotonic (s32), addrspace 1) + ; GCN-NEXT: $vgpr6 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: S_WAIT_XCNT 2 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_WAIT_XCNT 1 + ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + $vgpr2 = BUFFER_ATOMIC_ADD_ADDR64_RTN $vgpr2, $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 1, implicit $exec :: (load store (s32), addrspace 1) + GLOBAL_ATOMIC_ADD_F32 $vgpr4_vgpr5, killed renamable $vgpr3, 0, 0, implicit $exec :: (load store syncscope("agent-one-as") monotonic monotonic (s32), addrspace 1) + $vgpr6 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr3 = V_MOV_B32_e32 1, implicit $exec +... 
+ +# Force insert S_WAIT_XCNT 0 for dependency in SMEM instruction even though +# there is a pending VMEM dependency. + +--- +name: smem_xcnt_insertion_with_pending_vmem_event +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GCN-LABEL: name: smem_xcnt_insertion_with_pending_vmem_event + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 2, implicit $exec + ; GCN-NEXT: $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 4, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr4_vgpr5, 16, 0, implicit $exec + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: $sgpr2 = S_ADD_I32 $sgpr0, 100, implicit-def $scc + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 20, implicit $exec + $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4) + $vgpr0 = V_MOV_B32_e32 1, implicit $exec + $vgpr1 = V_MOV_B32_e32 2, implicit $exec + $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec + $vgpr4 = V_MOV_B32_e32 1, implicit $exec + $vgpr5 = V_MOV_B32_e32 4, implicit $exec + GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr4_vgpr5, 16, 0, implicit $exec + $sgpr2 = S_ADD_I32 $sgpr0, 100, implicit-def $scc + $vgpr0 = V_MOV_B32_e32 20, implicit $exec +... + +# The second instruction in the flat_load group has a WAR dependency with a prior +# memory operation (scratch_load instruction). 
+ +--- +name: vmem_group_reg_dependency_with_prior_instruction +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr4, $vgpr5 + ; GCN-LABEL: name: vmem_group_reg_dependency_with_prior_instruction + ; GCN: liveins: $vgpr4, $vgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr4, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1) + ; GCN-NEXT: $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1) + ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + $vgpr0 = SCRATCH_LOAD_DWORD $vgpr4, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec + $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1) + $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1) + $vgpr3 = V_MOV_B32_e32 1, implicit $exec +... + +# Two instructions inside the load group have dependencies with prior instructions. 
+ +--- +name: multiple_xcnt_insertion_in_group +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr3, $vgpr4, $vgpr5 + ; GCN-LABEL: name: multiple_xcnt_insertion_in_group + ; GCN: liveins: $vgpr3, $vgpr4, $vgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr4, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD $vgpr3, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: $vgpr7 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1) + ; GCN-NEXT: $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1) + ; GCN-NEXT: $vgpr5 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1) + ; GCN-NEXT: $vgpr8 = V_MOV_B32_e32 1, implicit $exec + $vgpr0 = SCRATCH_LOAD_DWORD $vgpr4, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec + $vgpr2 = SCRATCH_LOAD_DWORD $vgpr3, 0, 0, implicit $exec, implicit $flat_scr + $vgpr6 = V_MOV_B32_e32 1, implicit $exec + $vgpr7 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1) + $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1) + $vgpr5 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1) + $vgpr8 = V_MOV_B32_e32 1, implicit $exec +... 
+ +--- +name: xcnt_event_post_load_group +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr10 + ; GCN-LABEL: name: xcnt_event_post_load_group + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr10 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec + ; GCN-NEXT: $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec + ; GCN-NEXT: $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 3 + ; GCN-NEXT: GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr11 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 2 + ; GCN-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: S_WAIT_XCNT 1 + ; GCN-NEXT: $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec + $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec + $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec + $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec + $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec + GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + $vgpr11 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr + $vgpr3 = V_MOV_B32_e32 0, implicit $exec + $vgpr6 = V_MOV_B32_e32 1, implicit $exec + $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec +... 
+ +# The three V_MOV_B32 instructions waiting outside the group need appropriate wait_xcnt +# insertion as their dst registers have dependencies with instructions inside the group. + +--- +name: xcnt_event_post_store_group +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 + ; GCN-LABEL: name: xcnt_event_post_store_group + ; GCN: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr11 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + ; GCN-NEXT: S_WAIT_XCNT 8 + ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_WAIT_XCNT 6 + ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: S_WAIT_XCNT 4 + ; GCN-NEXT: $vgpr7 = V_MOV_B32_e32 2, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $vgpr11 = V_LSHLREV_B32_e64 16, $vgpr10, implicit $exec + $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + 
GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec + $vgpr11 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr + GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + $vgpr3 = V_MOV_B32_e32 0, implicit $exec + $vgpr5 = V_MOV_B32_e32 1, implicit $exec + $vgpr7 = V_MOV_B32_e32 2, implicit $exec + $vgpr11 = V_LSHLREV_B32_e64 16, $vgpr10, implicit $exec +... + +# This test captures the case that interleaving load store operations form separate groups. +# The registers in V_MOV_B32 all have dependencies with these independent groups and +# should have the wait_xcnt insertion with appropriate wait values. 
+ +--- +name: load_store_switching +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-LABEL: name: load_store_switching + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr5, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr1 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $vgpr7 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, $vgpr4, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr8 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD killed $vgpr2_vgpr3, $vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 1 + ; GCN-NEXT: $vgpr7 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_WAIT_XCNT 2 + ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $vgpr8 = V_MOV_B32_e32 2, implicit $exec + ; GCN-NEXT: S_WAIT_XCNT 0 + ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 3, implicit $exec + $vgpr0 = SCRATCH_LOAD_DWORD $vgpr5, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec + $vgpr7 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr2_vgpr3, $vgpr4, 0, 0, implicit $exec + $vgpr8 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec + GLOBAL_STORE_DWORD killed $vgpr2_vgpr3, $vgpr5, 0, 0, implicit $exec + $vgpr7 = V_MOV_B32_e32 0, implicit $exec + $vgpr4 = V_MOV_B32_e32 1, implicit $exec + $vgpr8 = V_MOV_B32_e32 2, implicit $exec + $vgpr5 = V_MOV_B32_e32 3, implicit $exec +... + +# V_DUAL_MOV is a single instruction and should emit required xcnt +# if the destination registers have any memory-op dependency. 
+ +--- +name: dual_mov +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr1 + ; GCN-LABEL: name: dual_mov + ; GCN: liveins: $sgpr0, $sgpr1, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD $vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $vgpr1, $vgpr2 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + $vgpr2 = SCRATCH_LOAD_DWORD $vgpr1, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1, $vgpr2 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 1, implicit $exec +... + +# No xcnt wait insertion for DS load/store operations. + +--- +name: ds_load_store +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + ; GCN-LABEL: name: ds_load_store + ; GCN: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: $vgpr0 = DS_READ_B32_gfx9 killed $vgpr1, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) undef`, addrspace 3) + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 2, implicit $exec + ; GCN-NEXT: S_WAIT_DSCNT 0 + ; GCN-NEXT: DS_WRITE_B32_gfx9 killed $vgpr0, killed $vgpr1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) undef`, addrspace 3) + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 20, implicit $exec + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + $vgpr0 = DS_READ_B32_gfx9 killed $vgpr1, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(3)* undef`) + $vgpr1 = V_MOV_B32_e32 2, implicit $exec + DS_WRITE_B32_gfx9 killed $vgpr0, killed $vgpr1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(3)* undef`) + $vgpr0 = V_MOV_B32_e32 20, implicit $exec +... 
+ +--- +name: xcnt_max +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: xcnt_max + ; GCN: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + 
; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: 
GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: S_WAIT_XCNT 62 + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + 
GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + 
GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec + $vgpr2 = V_MOV_B32_e32 1, implicit $exec +...