diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index 10ffbe281beac..fd222a206c4e1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -209,18 +209,14 @@ class SchedGroup { // Remove last element in the SchedGroup void pop() { Collection.pop_back(); } - // Identify and add all relevant SUs from the DAG to this SchedGroup. - void initSchedGroup(); + template + void findCandidateSUnits(T Begin, T End, + SUnitsToCandidateSGsMap &SyncedInstrs); - // Add instructions to the SchedGroup bottom up starting from RIter. - // PipelineInstrs is a set of instructions that should not be added to the - // SchedGroup even when the other conditions for adding it are satisfied. - // RIter will be added to the SchedGroup as well, and dependencies will be - // added so that RIter will always be scheduled at the end of the group. - void initSchedGroup(std::vector::reverse_iterator RIter, - SUnitsToCandidateSGsMap &SyncedInstrs); - - void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs); + /// Find each SUnit in the DAG that could potentially be added to + /// this SchedGroup and add the SGID to the candidate SchedGroups + /// for SU in \p SyncedInstrs. + void findCandidateSUnits(SUnitsToCandidateSGsMap &SyncedInstrs); int getSyncID() { return SyncID; } @@ -876,11 +872,11 @@ bool MFMASmallGemmOpt::applyIGLPStrategy( for (unsigned I = 0; I < MFMACount * 3; ++I) { SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); } return true; @@ -1597,7 +1593,7 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy( SG->addRule( std::make_shared(1, TII, SG->getSGID(), true)); SG->addRule(std::make_shared(TII, SG->getSGID())); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); // Second Round FMA SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( @@ -1610,7 +1606,7 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy( SG->addRule(std::make_shared(MFMAEnablement + 1, TII, SG->getSGID(), true)); SG->addRule(std::make_shared(TII, SG->getSGID())); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); } if (UsesDSRead) { @@ -1618,7 +1614,7 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy( SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared(*FirstPipeDSR, TII, SG->getSGID())); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); } // First Round EXP @@ -1632,7 +1628,7 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy( SG->addRule(std::make_shared(TII, SG->getSGID(), true)); SG->addRule(std::make_shared(8, TII, SG->getSGID(), HasChainBetweenCvt)); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); incrementTransPosition(); @@ -1649,7 +1645,7 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy( else SG->addRule(std::make_shared( 1 + (2 + UsesFMA) * I, TII, SG->getSGID())); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); } // Third Round FMA @@ -1664,7 +1660,7 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy( SG->addRule(std::make_shared(2 * MFMAEnablement + 1, TII, SG->getSGID(), true)); SG->addRule(std::make_shared(TII, SG->getSGID())); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); } // Second Round EXP @@ -1680,7 +1676,7 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy( SG->addRule(std::make_shared(TII, SG->getSGID(), true)); SG->addRule(std::make_shared(8, TII, SG->getSGID(), HasChainBetweenCvt)); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); } // The "extra" EXP which enables all MFMA @@ -1690,7 +1686,7 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy( SG->addRule(std::make_shared(TII, SG->getSGID(), true)); SG->addRule(std::make_shared( 8, TII, SG->getSGID(), HasChainBetweenCvt)); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); // PHASE 2: Main Interleave Loop @@ -1727,14 +1723,14 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy( SG->getSGID(), true)); else SG->addRule(std::make_shared(TII, SG->getSGID(), true)); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); incrementMFMAPosition(); if (UsesVALU) { SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared(TII, SG->getSGID())); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); } if (UsesDSRead && !(I % 4)) { @@ -1742,7 +1738,7 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy( SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared(*FirstPipeDSR, TII, SG->getSGID())); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); } // CVT, EXP, FMA Interleaving @@ -1770,7 +1766,7 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy( else SG->addRule(std::make_shared(CurrentOffset, TII, SG->getSGID())); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); } // Round N + 3 FMA @@ -1787,7 +1783,7 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy( (((I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1, TII, SG->getSGID(), true)); SG->addRule(std::make_shared(TII, SG->getSGID())); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); } // Round N + 2 Exp @@ -1804,7 +1800,7 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy( SG->addRule(std::make_shared(TII, SG->getSGID(), true)); SG->addRule(std::make_shared(8, TII, SG->getSGID(), HasChainBetweenCvt)); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); } } @@ -1812,7 +1808,7 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy( SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared(TII, SG->getSGID(), true)); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); return true; } @@ -1848,11 +1844,11 @@ bool MFMAExpSimpleInterleaveOpt::applyIGLPStrategy( for (unsigned I = 0; I < MFMACount * 3; ++I) { SchedGroup *SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); } return true; @@ -2161,11 +2157,11 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( for (unsigned I = 0; I < MFMACount; I++) { SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); } } @@ -2178,21 +2174,21 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared(TII, SG->getSGID(), true)); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); // Interleave MFMA with DS_READ prefetch for (unsigned I = 4; I < DSRCount; ++I) { SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); } // Phase 2a: Loop carried dependency with V_PERM @@ -2202,34 +2198,34 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared(TII, SG->getSGID(), true)); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared(TII, SG->getSGID())); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared( 1, TII, SG->getSGID(), true)); SG->addRule(std::make_shared(TII, SG->getSGID())); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared( 3, TII, SG->getSGID(), true)); SG->addRule(std::make_shared(TII, SG->getSGID())); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); } // Phase 2b: Loop carried dependency without V_PERM @@ -2238,16 +2234,16 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( for (unsigned I = DSWWithPermCount; I < DSWCount; I++) { SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared(TII, SG->getSGID())); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); } // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are @@ -2259,52 +2255,52 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared(TII, SG->getSGID(), true)); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared(TII, SG->getSGID())); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared(TII, SG->getSGID(), true)); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared(TII, SG->getSGID())); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared( 2, TII, SG->getSGID(), true)); SG->addRule(std::make_shared(TII, SG->getSGID())); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared( 4, TII, SG->getSGID(), true)); SG->addRule(std::make_shared(TII, SG->getSGID())); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); - SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); } return true; @@ -2572,43 +2568,17 @@ bool SchedGroup::canAddSU(SUnit &SU) const { return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); }); } -void SchedGroup::initSchedGroup() { - for (auto &SU : DAG->SUnits) { - if (isFull()) - break; - - if (canAddSU(SU)) - add(SU); - } -} - -void SchedGroup::initSchedGroup(std::vector::reverse_iterator RIter, - SUnitsToCandidateSGsMap &SyncedInstrs) { - SUnit &InitSU = *RIter; - for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) { - auto &SU = *RIter; - if (isFull()) - break; - +template +void SchedGroup::findCandidateSUnits(T Begin, T End, + SUnitsToCandidateSGsMap &SyncedInstrs) { + for (SUnit &SU : make_range(Begin, End)) { if (canAddSU(SU)) SyncedInstrs[&SU].push_back(SGID); } - - add(InitSU); - assert(MaxSize); - (*MaxSize)++; } -void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) { - auto I = DAG->SUnits.rbegin(); - auto E = DAG->SUnits.rend(); - for (; I != E; ++I) { - auto &SU = *I; - if (isFull()) - break; - if (canAddSU(SU)) - SyncedInstrs[&SU].push_back(SGID); - } +void SchedGroup::findCandidateSUnits(SUnitsToCandidateSGsMap &SyncedInstrs) { + findCandidateSUnits(DAG->SUnits.rbegin(), DAG->SUnits.rend(), SyncedInstrs); } void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { @@ -2654,14 +2624,15 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { MachineInstr &MI = *SchedBarrier.getInstr(); assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER); - // Remove all existing edges from the SCHED_BARRIER that were added due to the - // instruction having side effects. LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: " << MI.getOperand(0).getImm() << "\n"); auto InvertedMask = invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm()); SchedGroup SG(InvertedMask, std::nullopt, DAG, TII); - SG.initSchedGroup(); + + for (SUnit &SU : DAG->SUnits) + if (SG.canAddSU(SU)) + SG.add(SU); // Preserve original instruction ordering relative to the SCHED_BARRIER. SG.link( @@ -2711,18 +2682,18 @@ IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const { void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage( std::vector::reverse_iterator RIter) { - // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due - // to the instruction having side effects. MachineInstr &SGB = *RIter->getInstr(); assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER); int32_t SGMask = SGB.getOperand(0).getImm(); int32_t Size = SGB.getOperand(1).getImm(); int32_t SyncID = SGB.getOperand(2).getImm(); + Size++; // Make room for the SCHED_GROUP_BARRIER instruction auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask, Size, SyncID, DAG, TII); - - SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]); + SG.add(*RIter); + SG.findCandidateSUnits(RIter, SG.DAG->SUnits.rend(), + SyncedInstrs[SG.getSyncID()]); } bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {