diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 35dddf85596d9..fc0df61952e48 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -61,15 +61,6 @@ static cl::opt<bool> UseCostHeur(
              "Experimentally, results are mixed, so this should be set on a "
              "case-by-case basis."));
 
-static cl::opt<bool> EnableLowerBound(
-    "amdgpu-igrouplp-exact-solver-lower-bound", cl::Hidden,
-    cl::desc("Whether to use a lower bound when calculating the cost "
-             "for a partial fit using the exact solver. The lower bound "
-             "calculates the cost of assigning the remaining instructions "
-             "under idealized conditions. The LB reduces the overall search "
-             "space but adds time complexity per branch explored."),
-    cl::init(false));
-
 // Components of the mask that determines which instruction types may be
 // classified into a SchedGroup.
 enum class SchedGroupMask {
@@ -118,11 +109,7 @@ class SchedGroup {
 
   const SIInstrInfo *TII;
 
-  // Try to add an edge from SU A to SU B. This returns false if there is a
-  // dependency which makes adding the A->B edge impossible, otherwise it
-  // returns true. The result is that it will return true even if no edge was
-  // added. For example, if there is already an edge between A->B, this will
-  // return true, even though DAG->addEdge does not add an edge.
+  // Try to add an edge from SU A to SU B.
   bool tryAddEdge(SUnit *A, SUnit *B);
 
   // Use SGMask to determine whether we can classify MI as a member of this
@@ -144,7 +131,7 @@ class SchedGroup {
   // Add DAG dependencies and track which edges are added, and the count of
   // missed edges
   int link(SUnit &SU, bool MakePred,
-           SmallVectorImpl<std::pair<SUnit *, SUnit *>> &AddedEdges);
+           std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
 
   // Add DAG dependencies from all SUnits in this SchedGroup and this SU.
   // Use the predicate to determine whether SU should be a predecessor (P =
@@ -256,9 +243,6 @@ class PipelineSolver {
   int BestCost = -1;
   int CurrCost = 0;
 
-  // A lower bound on the optimal cost for a complete pipeline
-  int StaticLowerBound = 0;
-
   // Index pointing to the conflicting instruction that is currently being
   // fitted
   int CurrConflInstNo = 0;
@@ -286,19 +270,14 @@ class PipelineSolver {
   void populateReadyList(SUToCandSGsPair &CurrSU,
                          SmallVectorImpl<std::pair<int, int>> &ReadyList,
                          SmallVectorImpl<SchedGroup> &SyncPipeline);
-  // Calculate best cost assignment of an unassigned SU without assigning it.
-  // The sum of these costs across SUs represents a Lower Bound on the true best
-  // cost for the set of unassigned SUs.
-  int calculateLowerBound();
   // Add edges corresponding to the SchedGroups as assigned by solver
   void makePipeline();
 
   // Add the edges from the SU to the other SchedGroups in pipeline, and
   // return the number of edges missed.
   int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
-               SmallVectorImpl<std::pair<SUnit *, SUnit *>> &AddedEdges,
-               int BestCost = -1);
+               std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
   // Remove the edges passed via AddedEdges
-  void removeEdges(SmallVectorImpl<std::pair<SUnit *, SUnit *>> &AddedEdges);
+  void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
   // Convert the passed in maps to arrays for bidirectional iterators
   void convertSyncMapsToArrays();
@@ -416,7 +395,7 @@ void PipelineSolver::makePipeline() {
 
 int PipelineSolver::addEdges(
     SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
-    SmallVectorImpl<std::pair<SUnit *, SUnit *>> &AddedEdges, int BestCost) {
+    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
   int AddedCost = 0;
   bool MakePred = false;
 
@@ -427,8 +406,6 @@ int PipelineSolver::addEdges(
   // linked as a predecessor of the subsequent SchedGroups
   auto GroupNo = (int)SyncPipeline.size() - 1;
   for (; GroupNo >= 0; GroupNo--) {
-    if (BestCost != -1 && AddedCost >= BestCost)
-      return AddedCost;
     if (SyncPipeline[GroupNo].getSGID() == SGID) {
       MakePred = true;
       continue;
@@ -442,18 +419,15 @@ int PipelineSolver::addEdges(
 }
 
 void PipelineSolver::removeEdges(
-    SmallVectorImpl<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
+    const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
   // Only remove the edges that we have added when testing
   // the fit.
   for (auto &PredSuccPair : EdgesToRemove) {
     SUnit *Pred = PredSuccPair.first;
     SUnit *Succ = PredSuccPair.second;
 
-    auto Match =
-        std::find_if(Succ->Preds.begin(), Succ->Preds.end(), [&Pred](SDep &P) {
-          return P.getSUnit() == Pred && P.isArtificial();
-        });
-
+    auto Match = llvm::find_if(
+        Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
     if (Match != Succ->Preds.end()) {
       assert(Match->isArtificial());
       Succ->removePred(*Match);
@@ -504,7 +478,7 @@ bool PipelineSolver::checkOptimal() {
     if (BestCost == -1 || CurrCost < BestCost) {
       BestPipeline = CurrPipeline;
       BestCost = CurrCost;
-      LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << '\n');
+      LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
     }
     assert(BestCost >= 0);
   }
@@ -513,7 +487,7 @@ bool PipelineSolver::checkOptimal() {
   if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
     DoneExploring = true;
 
-  return (DoneExploring || BestCost == StaticLowerBound);
+  return (DoneExploring || BestCost == 0);
 }
 
 void PipelineSolver::populateReadyList(
@@ -522,9 +496,8 @@ void PipelineSolver::populateReadyList(
   assert(CurrSU.second.size() >= 1);
   auto I = CurrSU.second.rbegin();
   auto E = CurrSU.second.rend();
-  SmallVector<std::pair<SUnit *, SUnit *>, 16> AddedEdges;
   for (; I != E; ++I) {
-
+    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
     int CandSGID = *I;
     SchedGroup *Match;
     for (auto &SG : SyncPipeline) {
@@ -537,7 +510,6 @@ void PipelineSolver::populateReadyList(
        ReadyList.push_back(std::pair(*I, MissPenalty));
        continue;
      }
-      AddedEdges.clear();
      int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
      ReadyList.push_back(std::pair(*I, TempCost));
      removeEdges(AddedEdges);
@@ -556,52 +528,6 @@ void PipelineSolver::populateReadyList(
   assert(ReadyList.size() == CurrSU.second.size());
 }
 
-int PipelineSolver::calculateLowerBound() {
-  if (CurrSyncGroupIdx >= (int)CurrPipeline.size())
-    return 0;
-  int TempConflInstNo = CurrConflInstNo;
-  int TmpSyncGroupIdx = CurrSyncGroupIdx;
-  int MinimumCost = 0;
-  SmallVector<std::pair<SUnit *, SUnit *>, 16> AddedEdges;
-
-  for (; TmpSyncGroupIdx < (int)CurrPipeline.size(); TmpSyncGroupIdx++) {
-    auto SyncPipeline = CurrPipeline[TmpSyncGroupIdx];
-    for (; TempConflInstNo < (int)PipelineInstrs[TmpSyncGroupIdx].size();
-         TempConflInstNo++) {
-      auto CurrSU = PipelineInstrs[TmpSyncGroupIdx][TempConflInstNo];
-      auto I = CurrSU.second.rbegin();
-      auto E = CurrSU.second.rend();
-      int MinCostForSU = -1;
-      for (; I != E; I++) {
-        int CandSGID = *I;
-        SchedGroup *Match;
-        for (auto &SG : SyncPipeline) {
-          if (SG.getSGID() == CandSGID)
-            Match = &SG;
-        }
-
-        if (Match->isFull()) {
-          if (MinCostForSU == -1 || MissPenalty < MinCostForSU)
-            MinCostForSU = MissPenalty;
-          continue;
-        }
-        AddedEdges.clear();
-        int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID,
-                                AddedEdges, MinCostForSU);
-        if (MinCostForSU == -1 || TempCost < MinCostForSU)
-          MinCostForSU = TempCost;
-
-        removeEdges(AddedEdges);
-        if (MinCostForSU == 0)
-          break;
-      }
-      MinimumCost += MinCostForSU;
-    }
-    TempConflInstNo = 0;
-  }
-  return MinimumCost;
-}
-
 bool PipelineSolver::solveExact() {
   if (checkOptimal())
     return true;
@@ -614,13 +540,12 @@ bool PipelineSolver::solveExact() {
          PipelineInstrs[CurrSyncGroupIdx].size());
   SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
   LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
-                    << ") in Pipeline # " << CurrSyncGroupIdx << '\n');
+                    << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
 
   // SchedGroup -> Cost pairs
   SmallVector<std::pair<int, int>, 4> ReadyList;
   // Prioritize the candidate sched groups in terms of lowest cost first
   populateReadyList(CurrSU, ReadyList, CurrPipeline[CurrSyncGroupIdx]);
-  SmallVector<std::pair<SUnit *, SUnit *>, 16> AddedEdges;
 
   auto I = ReadyList.begin();
   auto E = ReadyList.end();
@@ -633,6 +558,7 @@ bool PipelineSolver::solveExact() {
 
     int CandSGID = I->first;
     int AddedCost = 0;
+    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
     auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
     SchedGroup *Match;
     for (auto &SG : SyncPipeline) {
@@ -645,22 +571,19 @@ bool PipelineSolver::solveExact() {
 
     LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
                       << (int)Match->getMask() << "and ID " << CandSGID
-                      << '\n');
+                      << "\n");
     Match->add(*CurrSU.first);
-    AddedEdges.clear();
     AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
-    LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << '\n');
+    LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
     CurrCost += AddedCost;
     advancePosition();
     ++BranchesExplored;
     bool FinishedExploring = false;
     // If the Cost after adding edges is greater than a known solution,
     // backtrack
-    int LBCost =
-        (EnableLowerBound && BestCost != -1) ? calculateLowerBound() : 0;
-    if (BestCost == -1 || CurrCost + LBCost < BestCost) {
+    if (CurrCost < BestCost || BestCost == -1) {
       if (solveExact()) {
-        FinishedExploring = BestCost != StaticLowerBound;
+        FinishedExploring = BestCost != 0;
         if (!FinishedExploring)
           return true;
       }
@@ -686,7 +609,7 @@ bool PipelineSolver::solveExact() {
   bool FinishedExploring = false;
   if (CurrCost < BestCost || BestCost == -1) {
     if (solveExact()) {
-      bool FinishedExploring = BestCost != StaticLowerBound;
+      bool FinishedExploring = BestCost != 0;
       if (!FinishedExploring)
         return true;
     }
@@ -699,7 +622,7 @@ bool PipelineSolver::solveExact() {
 
 bool PipelineSolver::solveGreedy() {
   BestCost = 0;
-  SmallVector<std::pair<SUnit *, SUnit *>, 16> AddedEdges;
+  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
 
   while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
     SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
@@ -709,7 +632,7 @@ bool PipelineSolver::solveGreedy() {
     int BestGroupID = -1;
     auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
     LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
-                      << ") in Pipeline # " << CurrSyncGroupIdx << '\n');
+                      << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
 
     // Since we have added the potential SchedGroups from bottom up, but
     // traversed the DAG from top down, parse over the groups from last to
@@ -718,7 +641,7 @@ bool PipelineSolver::solveGreedy() {
     auto I = CurrSU.second.rbegin();
     auto E = CurrSU.second.rend();
     for (; I != E; ++I) {
-      SmallVector<std::pair<SUnit *, SUnit *>, 16> AddedEdges;
+      std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
      int CandSGID = *I;
      SchedGroup *Match;
      for (auto &SG : SyncPipeline) {
@@ -727,15 +650,14 @@ bool PipelineSolver::solveGreedy() {
       }
 
       LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
-                        << (int)Match->getMask() << '\n');
+                        << (int)Match->getMask() << "\n");
 
       if (Match->isFull()) {
         LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
         continue;
       }
-      TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges,
-                          BestNodeCost);
-      LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << '\n');
+      TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
+      LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
       if (TempCost < BestNodeCost || BestNodeCost == -1) {
         BestGroup = Match;
         BestNodeCost = TempCost;
@@ -750,7 +672,7 @@ bool PipelineSolver::solveGreedy() {
       BestGroup->add(*CurrSU.first);
       addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
       LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"
-                        << (int)BestGroup->getMask() << '\n');
+                        << (int)BestGroup->getMask() << "\n");
       BestCost += TempCost;
     } else
       BestCost += MissPenalty;
@@ -787,14 +709,11 @@ void PipelineSolver::solve() {
     LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
     solveGreedy();
     reset();
-    LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << '\n');
-    StaticLowerBound = calculateLowerBound();
-    LLVM_DEBUG(dbgs() << "Lower Bound on Pipeline Cost is " << StaticLowerBound
-                      << '\n');
-    if (BestCost > StaticLowerBound) {
+    LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
+    if (BestCost > 0) {
       LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
       solveExact();
-      LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << '\n');
+      LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
     } else { // Use the Greedy Algorithm by default
       LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
       solveGreedy();
@@ -978,7 +897,7 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
 }
 
 int SchedGroup::link(SUnit &SU, bool MakePred,
-                     SmallVectorImpl<std::pair<SUnit *, SUnit *>> &AddedEdges) {
+                     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
   int MissedEdges = 0;
   for (auto *A : Collection) {
     SUnit *B = &SU;
@@ -987,6 +906,10 @@ int SchedGroup::link(SUnit &SU, bool MakePred,
     if (MakePred)
       std::swap(A, B);
 
+    if (DAG->IsReachable(B, A))
+      continue;
+    // tryAddEdge returns false if there is a dependency that makes adding
+    // the A->B edge impossible, otherwise it returns true.
     bool Added = tryAddEdge(A, B);
     if (Added)
       AddedEdges.push_back(std::pair(A, B));
diff --git a/llvm/test/CodeGen/AMDGPU/igrouplp-pipelinesolver-lowerbound.ll b/llvm/test/CodeGen/AMDGPU/igrouplp-pipelinesolver-lowerbound.ll
deleted file mode 100644
index bcdf2abc4ddc0..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/igrouplp-pipelinesolver-lowerbound.ll
+++ /dev/null
@@ -1,241 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACT %s
-; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -amdgpu-igrouplp-exact-solver=1 -amdgpu-igrouplp-exact-solver-max-branches=200000 -amdgpu-igrouplp-exact-solver-cost-heur=1 < %s | FileCheck -check-prefix=LB %s
-
-define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(<32 x i32> addrspace(1)* noalias %in, <32 x i32> addrspace(1)* noalias %out) #0 {
-; EXACT-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
-; EXACT:       ; %bb.0:
-; EXACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; EXACT-NEXT:    v_lshlrev_b32_e32 v16, 7, v0
-; EXACT-NEXT:    ; kill: killed $sgpr0_sgpr1
-; EXACT-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACT-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
-; EXACT-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; EXACT-NEXT:    s_waitcnt vmcnt(1)
-; EXACT-NEXT:    v_mul_lo_u32 v13, v13, v13
-; EXACT-NEXT:    s_waitcnt vmcnt(0)
-; EXACT-NEXT:    v_mul_lo_u32 v7, v7, v7
-; EXACT-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
-; EXACT-NEXT:    v_mul_lo_u32 v6, v6, v6
-; EXACT-NEXT:    v_mul_lo_u32 v12, v12, v12
-; EXACT-NEXT:    v_mul_lo_u32 v15, v15, v15
-; EXACT-NEXT:    v_mul_lo_u32 v14, v14, v14
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACT-NEXT:    s_waitcnt vmcnt(0)
-; EXACT-NEXT:    v_mul_lo_u32 v3, v3, v3
-; EXACT-NEXT:    v_mul_lo_u32 v2, v2, v2
-; EXACT-NEXT:    v_mul_lo_u32 v1, v1, v1
-; EXACT-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACT-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
-; EXACT-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:112
-; EXACT-NEXT:    s_waitcnt vmcnt(0)
-; EXACT-NEXT:    v_mul_lo_u32 v3, v3, v3
-; EXACT-NEXT:    v_mul_lo_u32 v2, v2, v2
-; EXACT-NEXT:    v_mul_lo_u32 v1, v1, v1
-; EXACT-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACT-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
-; EXACT-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
-; EXACT-NEXT:    s_waitcnt vmcnt(0)
-; EXACT-NEXT:    v_mul_lo_u32 v3, v3, v3
-; EXACT-NEXT:    v_mul_lo_u32 v2, v2, v2
-; EXACT-NEXT:    v_mul_lo_u32 v1, v1, v1
-; EXACT-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACT-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
-; EXACT-NEXT:    v_mul_lo_u32 v5, v5, v5
-; EXACT-NEXT:    v_mul_lo_u32 v4, v4, v4
-; EXACT-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
-; EXACT-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:64
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACT-NEXT:    s_waitcnt vmcnt(0)
-; EXACT-NEXT:    v_mul_lo_u32 v7, v7, v7
-; EXACT-NEXT:    v_mul_lo_u32 v6, v6, v6
-; EXACT-NEXT:    v_mul_lo_u32 v5, v5, v5
-; EXACT-NEXT:    v_mul_lo_u32 v4, v4, v4
-; EXACT-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:64
-; EXACT-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
-; EXACT-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:16
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; EXACT-NEXT:    s_waitcnt vmcnt(0)
-; EXACT-NEXT:    v_mul_lo_u32 v9, v9, v9
-; EXACT-NEXT:    v_mul_lo_u32 v8, v8, v8
-; EXACT-NEXT:    v_mul_lo_u32 v11, v11, v11
-; EXACT-NEXT:    v_mul_lo_u32 v10, v10, v10
-; EXACT-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
-; EXACT-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:80
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACT-NEXT:    s_waitcnt vmcnt(0)
-; EXACT-NEXT:    v_mul_lo_u32 v11, v11, v11
-; EXACT-NEXT:    v_mul_lo_u32 v10, v10, v10
-; EXACT-NEXT:    v_mul_lo_u32 v9, v9, v9
-; EXACT-NEXT:    v_mul_lo_u32 v8, v8, v8
-; EXACT-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:80
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACT-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; EXACT-NEXT:    s_endpgm
-;
-; LB-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
-; LB:       ; %bb.0:
-; LB-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; LB-NEXT:    v_lshlrev_b32_e32 v12, 7, v0
-; LB-NEXT:    s_waitcnt lgkmcnt(0)
-; LB-NEXT:    global_load_dwordx4 v[8:11], v12, s[0:1] offset:64
-; LB-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; LB-NEXT:    s_waitcnt vmcnt(0)
-; LB-NEXT:    v_mul_lo_u32 v11, v11, v11
-; LB-NEXT:    v_mul_lo_u32 v10, v10, v10
-; LB-NEXT:    v_mul_lo_u32 v9, v9, v9
-; LB-NEXT:    v_mul_lo_u32 v8, v8, v8
-; LB-NEXT:    global_store_dwordx4 v12, v[8:11], s[2:3] offset:64
-; LB-NEXT:    global_load_dwordx4 v[0:3], v12, s[0:1]
-; LB-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; LB-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; LB-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; LB-NEXT:    s_waitcnt vmcnt(0)
-; LB-NEXT:    v_mul_lo_u32 v3, v3, v3
-; LB-NEXT:    v_mul_lo_u32 v2, v2, v2
-; LB-NEXT:    global_load_dwordx4 v[8:11], v12, s[0:1] offset:32
-; LB-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; LB-NEXT:    s_waitcnt vmcnt(0)
-; LB-NEXT:    v_mul_lo_u32 v9, v9, v9
-; LB-NEXT:    v_mul_lo_u32 v8, v8, v8
-; LB-NEXT:    v_mul_lo_u32 v11, v11, v11
-; LB-NEXT:    v_mul_lo_u32 v10, v10, v10
-; LB-NEXT:    global_store_dwordx4 v12, v[8:11], s[2:3] offset:32
-; LB-NEXT:    global_load_dwordx4 v[4:7], v12, s[0:1] offset:112
-; LB-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; LB-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; LB-NEXT:    s_waitcnt vmcnt(0)
-; LB-NEXT:    v_mul_lo_u32 v7, v7, v7
-; LB-NEXT:    v_mul_lo_u32 v6, v6, v6
-; LB-NEXT:    v_mul_lo_u32 v1, v1, v1
-; LB-NEXT:    v_mul_lo_u32 v0, v0, v0
-; LB-NEXT:    global_store_dwordx4 v12, v[0:3], s[2:3]
-; LB-NEXT:    global_load_dwordx4 v[0:3], v12, s[0:1] offset:96
-; LB-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; LB-NEXT:    s_waitcnt vmcnt(0)
-; LB-NEXT:    v_mul_lo_u32 v3, v3, v3
-; LB-NEXT:    v_mul_lo_u32 v2, v2, v2
-; LB-NEXT:    v_mul_lo_u32 v1, v1, v1
-; LB-NEXT:    v_mul_lo_u32 v0, v0, v0
-; LB-NEXT:    global_store_dwordx4 v12, v[0:3], s[2:3] offset:96
-; LB-NEXT:    global_load_dwordx4 v[0:3], v12, s[0:1] offset:80
-; LB-NEXT:    s_waitcnt vmcnt(0)
-; LB-NEXT:    v_mul_lo_u32 v3, v3, v3
-; LB-NEXT:    v_mul_lo_u32 v2, v2, v2
-; LB-NEXT:    v_mul_lo_u32 v1, v1, v1
-; LB-NEXT:    v_mul_lo_u32 v0, v0, v0
-; LB-NEXT:    global_store_dwordx4 v12, v[0:3], s[2:3] offset:80
-; LB-NEXT:    v_mul_lo_u32 v5, v5, v5
-; LB-NEXT:    v_mul_lo_u32 v4, v4, v4
-; LB-NEXT:    global_store_dwordx4 v12, v[4:7], s[2:3] offset:112
-; LB-NEXT:    global_load_dwordx4 v[4:7], v12, s[0:1] offset:48
-; LB-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; LB-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; LB-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; LB-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; LB-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; LB-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; LB-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; LB-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; LB-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; LB-NEXT:    s_waitcnt vmcnt(0)
-; LB-NEXT:    v_mul_lo_u32 v5, v5, v5
-; LB-NEXT:    global_load_dwordx4 v[0:3], v12, s[0:1] offset:16
-; LB-NEXT:    v_mul_lo_u32 v4, v4, v4
-; LB-NEXT:    s_waitcnt vmcnt(0)
-; LB-NEXT:    v_mul_lo_u32 v1, v1, v1
-; LB-NEXT:    v_mul_lo_u32 v0, v0, v0
-; LB-NEXT:    v_mul_lo_u32 v3, v3, v3
-; LB-NEXT:    v_mul_lo_u32 v2, v2, v2
-; LB-NEXT:    global_store_dwordx4 v12, v[0:3], s[2:3] offset:16
-; LB-NEXT:    v_mul_lo_u32 v7, v7, v7
-; LB-NEXT:    v_mul_lo_u32 v6, v6, v6
-; LB-NEXT:    global_store_dwordx4 v12, v[4:7], s[2:3] offset:48
-; LB-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; LB-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; LB-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; LB-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; LB-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; LB-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; LB-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; LB-NEXT:    s_endpgm
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep1 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %in, i32 %tid
-  %load = load <32 x i32>, <32 x i32> addrspace(1)* %gep1
-  %mul = mul <32 x i32> %load, %load
-  %gep2 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %out, i32 %tid
-  store <32 x i32> %mul, <32 x i32> addrspace(1)* %gep2
-  ; 1 VMEM read
-  call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
-  ; 2 VALU
-  call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
-  ; 1 VMEM write
-  call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
-  ; 1 VMEM read
-  call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
-  ; 2 VALU
-  call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
-  ; 1 VMEM write
-  call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
-  ; 1 VMEM read
-  call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
-  ; 2 VALU
-  call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
-  ; 1 VMEM write
-  call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
-  ; 1 VMEM read
-  call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
-  ; 2 VALU
-  call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
-  ; 1 VMEM write
-  call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
-  ; 1 VMEM read
-  call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
-  ; 2 VALU
-  call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
-  ; 1 VMEM write
-  call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
-  ; 1 VMEM read
-  call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
-  ; 2 VALU
-  call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
-  ; 1 VMEM write
-  call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
-  ; 1 VMEM read
-  call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
-  ; 2 VALU
-  call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
-  ; 1 VMEM write
-  call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
-  ; 1 VMEM read
-  call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
-  ; 2 VALU
-  call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
-  ; 1 VMEM write
-  call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
-  ret void
-}
-
-declare i32 @llvm.amdgcn.workitem.id.x() #0
-declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #0
-declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #0
-
-attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" readnone speculatable}
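
With the lower bound removed, PipelineSolver::solveExact() above is plain branch-and-bound again: a branch is pruned as soon as its running cost is no better than the best complete pipeline found so far, and the search terminates early only when a zero-cost (perfect) fit appears. The sketch below is a minimal, self-contained illustration of that control flow; the names, the cost table, and the driver are invented for the example and are not the LLVM data structures.

// branch_and_bound_sketch.cpp -- illustrative only, not the LLVM code.
// Mirrors the pruning and early-exit structure of solveExact() after this
// patch: no lower-bound term, prune against BestCost, stop only on cost 0.
#include <cstddef>
#include <cstdio>
#include <vector>

static int BestCost = -1; // -1 means no complete assignment seen yet.

// Cost[I][G] is the (hypothetical) cost of placing instruction I into its
// G-th candidate scheduling group.
static bool solveExact(const std::vector<std::vector<int>> &Cost,
                       std::size_t InstNo, int CurrCost) {
  if (InstNo == Cost.size()) { // A complete pipeline has been assigned.
    if (BestCost == -1 || CurrCost < BestCost)
      BestCost = CurrCost;
    return BestCost == 0;      // Only a perfect fit ends the search early.
  }
  for (int GroupCost : Cost[InstNo]) {
    int NextCost = CurrCost + GroupCost;
    // Prune: a known solution is already at least as cheap, so this branch
    // cannot improve on it. This is the check that remains once the
    // lower-bound term is gone.
    if (BestCost != -1 && NextCost >= BestCost)
      continue;
    if (solveExact(Cost, InstNo + 1, NextCost))
      return true;             // Propagate early termination upward.
  }
  return false;
}

int main() {
  // Three "instructions", each with two candidate groups and made-up costs.
  std::vector<std::vector<int>> Cost = {{2, 0}, {1, 3}, {0, 4}};
  solveExact(Cost, 0, 0);
  std::printf("best cost: %d\n", BestCost); // Prints "best cost: 1".
  return 0;
}

Because the early-exit test compares against zero rather than a computed bound, a search whose optimum is nonzero now always runs until exhaustion or the amdgpu-igrouplp-exact-solver-max-branches limit, which is the trade-off this change accepts.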