diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index cd295e04fac3f..5e55186eaada9 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -38,6 +38,14 @@ static cl::opt
     cl::desc("Disable unclustred high register pressure "
              "reduction scheduling stage."),
     cl::init(false));
+static cl::opt<unsigned> ScheduleMetricBias(
+    "amdgpu-schedule-metric-bias", cl::Hidden,
+    cl::desc(
+        "Sets the bias which adds weight to occupancy vs latency. Set it to "
+        "100 to chase the occupancy only."),
+    cl::init(10));
+
+const unsigned ScheduleMetrics::ScaleFactor = 100;
 
 GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
     : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
@@ -862,6 +870,7 @@ void GCNSchedStage::checkScheduling() {
   // Check the results of scheduling.
   PressureAfter = DAG.getRealRegPressure(RegionIdx);
   LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
+  LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
 
   if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
       PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
@@ -925,6 +934,131 @@ void GCNSchedStage::checkScheduling() {
   }
 }
 
+// Returns the cycle at which SU can issue: CurrCycle, raised to the ready
+// cycle plus latency of every predecessor reached through an assigned-register
+// dependency. The result is also recorded in ReadyCycles under SU.NodeNum.
+unsigned
+GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
+                                      DenseMap<unsigned, unsigned> &ReadyCycles,
+                                      const TargetSchedModel &SM) {
+  unsigned ReadyCycle = CurrCycle;
+  for (auto &D : SU.Preds) {
+    if (D.isAssignedRegDep()) {
+      MachineInstr *DefMI = D.getSUnit()->getInstr();
+      unsigned Latency = SM.computeInstrLatency(DefMI);
+      unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
+      ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
+    }
+  }
+  ReadyCycles[SU.NodeNum] = ReadyCycle;
+  return ReadyCycle;
+}
+
+#ifndef NDEBUG
+// Orders (instruction, ready cycle) pairs by ascending ready cycle.
+struct EarlierIssuingCycle {
+  bool operator()(std::pair<MachineInstr *, unsigned> A,
+                  std::pair<MachineInstr *, unsigned> B) const {
+    return A.second < B.second;
+  }
+};
+
+// Dumps every instruction with its ready cycle and reports a bubble whenever
+// consecutive ready cycles are more than one cycle apart.
+static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
+                                        EarlierIssuingCycle> &ReadyCycles) {
+  if (ReadyCycles.empty())
+    return;
+  unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
+  dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
+         << " ##################\n# Cycle #\t\t\tInstruction "
+            " "
+            " \n";
+  unsigned IPrev = 1;
+  for (auto &I : ReadyCycles) {
+    if (I.second > IPrev + 1)
+      dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
+             << " CYCLES DETECTED ******************************\n\n";
+    dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
+    IPrev = I.second;
+  }
+}
+#endif
+
+// Walks InputSchedule in order, issuing one instruction per cycle and waiting
+// until each one is ready; returns the final cycle count together with the
+// number of cycles spent waiting (bubbles).
+ScheduleMetrics
+GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
+#ifndef NDEBUG
+  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
+      ReadyCyclesSorted;
+#endif
+  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
+  unsigned SumBubbles = 0;
+  DenseMap<unsigned, unsigned> ReadyCycles;
+  unsigned CurrCycle = 0;
+  for (auto &SU : InputSchedule) {
+    unsigned ReadyCycle =
+        computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
+    SumBubbles += ReadyCycle - CurrCycle;
+#ifndef NDEBUG
+    ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
+#endif
+    CurrCycle = ++ReadyCycle;
+  }
+#ifndef NDEBUG
+  LLVM_DEBUG(
+      printScheduleModel(ReadyCyclesSorted);
+      dbgs() << "\n\t"
+             << "Metric: "
+             << (SumBubbles
+                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
+                     : 1)
+             << "\n\n");
+#endif
+
+  return ScheduleMetrics(CurrCycle, SumBubbles);
+}
+
+// Computes the same metrics for the instructions of DAG in their current
+// order; instructions that have no SUnit are skipped.
+ScheduleMetrics
+GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
+#ifndef NDEBUG
+  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
+      ReadyCyclesSorted;
+#endif
+  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
+  unsigned SumBubbles = 0;
+  DenseMap<unsigned, unsigned> ReadyCycles;
+  unsigned CurrCycle = 0;
+  for (auto &MI : DAG) {
+    SUnit *SU = DAG.getSUnit(&MI);
+    if (!SU)
+      continue;
+    unsigned ReadyCycle =
+        computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
+    SumBubbles += ReadyCycle - CurrCycle;
+#ifndef NDEBUG
+    ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
+#endif
+    CurrCycle = ++ReadyCycle;
+  }
+#ifndef NDEBUG
+  LLVM_DEBUG(
+      printScheduleModel(ReadyCyclesSorted);
+      dbgs() << "\n\t"
+             << "Metric: "
+             << (SumBubbles
+                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
+                     : 1)
+             << "\n\n");
+#endif
+
+  return ScheduleMetrics(CurrCycle, SumBubbles);
+}
+
 bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
   if (WavesAfter < DAG.MinOccupancy)
     return true;
@@ -955,7 +1089,28 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
     return true;
   }
 
-  return false;
+  LLVM_DEBUG(
+      dbgs()
+      << "\n\t *** In shouldRevertScheduling ***\n"
+      << " *********** BEFORE UnclusteredHighRPStage ***********\n");
+  ScheduleMetrics MBefore =
+      getScheduleMetrics(DAG.SUnits);
+  LLVM_DEBUG(
+      dbgs()
+      << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
+  ScheduleMetrics MAfter = getScheduleMetrics(DAG);
+  unsigned OldMetric = MBefore.getMetric();
+  unsigned NewMetric = MAfter.getMetric();
+  unsigned WavesBefore =
+      std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
+  unsigned Profit =
+      ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
+       ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
+       NewMetric) /
+      ScheduleMetrics::ScaleFactor;
+  LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
+                    << MAfter << "Profit: " << Profit << "\n");
+  return Profit < ScheduleMetrics::ScaleFactor;
 }
 
 bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 2249138c7075a..d7b161ccbf07c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -126,6 +126,40 @@ class GCNMaxILPSchedStrategy final : public GCNSchedStrategy {
   GCNMaxILPSchedStrategy(const MachineSchedContext *C);
 };
 
+/// Pairs a schedule length with a bubble-cycle count and derives from them a
+/// metric scaled by ScaleFactor.
+class ScheduleMetrics {
+  unsigned ScheduleLength = 0;
+  unsigned BubbleCycles = 0;
+
+public:
+  ScheduleMetrics() = default;
+  ScheduleMetrics(unsigned L, unsigned BC)
+      : ScheduleLength(L), BubbleCycles(BC) {}
+  unsigned getLength() const { return ScheduleLength; }
+  unsigned getBubbles() const { return BubbleCycles; }
+  /// Returns BubbleCycles * ScaleFactor / ScheduleLength, but never less
+  /// than 1.
+  unsigned getMetric() const {
+    // A zero-length schedule has no ratio to compute; avoid dividing by zero.
+    if (ScheduleLength == 0)
+      return 1;
+    unsigned Metric = (BubbleCycles * ScaleFactor) / ScheduleLength;
+    // Metric is zero if the amount of bubbles is less than 1% which is too
+    // small. So, return 1.
+    return Metric ? Metric : 1;
+  }
+  static const unsigned ScaleFactor;
+};
+
+/// Prints the metric followed by the raw bubble/length counts to \p OS.
+inline raw_ostream &operator<<(raw_ostream &OS, const ScheduleMetrics &Sm) {
+  OS << "\n Schedule Metric (scaled by " << ScheduleMetrics::ScaleFactor
+     << " ) is: " << Sm.getMetric() << " [ " << Sm.getBubbles() << "/"
+     << Sm.getLength() << " ]\n";
+  return OS;
+}
+
 class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   friend class GCNSchedStage;
   friend class OccInitialScheduleStage;
@@ -259,6 +293,13 @@ class GCNSchedStage {
   // Check result of scheduling.
void checkScheduling();
 
+  // Computes the given schedule virtual execution time in clocks.
+  ScheduleMetrics getScheduleMetrics(const std::vector<SUnit> &InputSchedule);
+  ScheduleMetrics getScheduleMetrics(const GCNScheduleDAGMILive &DAG);
+  unsigned computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
+                                  DenseMap<unsigned, unsigned> &ReadyCycles,
+                                  const TargetSchedModel &SM);
+
   // Returns true if scheduling should be reverted.
   virtual bool shouldRevertScheduling(unsigned WavesAfter);
 
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 72f20109350cb..5a8a3114a0a9b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -6622,10 +6622,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0
 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0
 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v3
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index a46abf6770bbc..271991b983244 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -449,101 +449,92 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
 ; GFX8-NEXT: v_mov_b32_e32 v5, 0
 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX8-NEXT: v_mov_b32_e32 v6, 0
-; GFX8-NEXT: s_movk_i32 s4, 0x7f
+; GFX8-NEXT: s_movk_i32 s0, 0x7f
 ; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader
 ; GFX8-NEXT: ; =>This Loop Header: Depth=1
 ; GFX8-NEXT: ; Child
Loop BB1_2 Depth 2 ; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_mov_b32 s5, 0 +; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: .LBB1_2: ; %for.body ; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xffffb000, v3 -; GFX8-NEXT: s_mov_b64 s[0:1], vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffb800, v3 -; GFX8-NEXT: s_mov_b64 s[2:3], vcc -; GFX8-NEXT: v_addc_u32_e64 v8, vcc, -1, v4, s[0:1] +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, -1, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xffffc000, v3 -; GFX8-NEXT: s_mov_b64 s[0:1], vcc -; GFX8-NEXT: v_addc_u32_e64 v10, vcc, -1, v4, s[2:3] +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffb800, v3 +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, -1, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xffffc000, v3 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, -1, v4, vcc +; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[11:12] ; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0xffffc800, v3 -; GFX8-NEXT: s_mov_b64 s[2:3], vcc -; GFX8-NEXT: v_addc_u32_e64 v12, vcc, -1, v4, s[0:1] -; GFX8-NEXT: s_addk_i32 s5, 0x2000 -; GFX8-NEXT: s_cmp_gt_u32 s5, 0x3fffff -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v7, v5 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc -; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[11:12] -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0xffffd000, v3 -; GFX8-NEXT: s_mov_b64 s[0:1], vcc -; GFX8-NEXT: v_addc_u32_e64 v14, vcc, -1, v4, s[2:3] -; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[13:14] -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v9, v15 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, v10, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffd800, v3 -; GFX8-NEXT: s_mov_b64 s[2:3], vcc -; GFX8-NEXT: v_addc_u32_e64 v6, vcc, -1, v4, s[0:1] +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, -1, v4, vcc +; GFX8-NEXT: flat_load_dwordx2 v[13:14], 
v[13:14] +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0xffffd000, v3 +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xffffd800, v3 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0xffffe000, v3 +; GFX8-NEXT: v_addc_u32_e32 v20, vcc, -1, v4, vcc +; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] +; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18] +; GFX8-NEXT: s_addk_i32 s1, 0x2000 +; GFX8-NEXT: s_cmp_gt_u32 s1, 0x3fffff +; GFX8-NEXT: s_waitcnt vmcnt(5) +; GFX8-NEXT: v_add_u32_e32 v21, vcc, v7, v5 +; GFX8-NEXT: v_addc_u32_e32 v22, vcc, v8, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0xffffe800, v3 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xfffff000, v3 +; GFX8-NEXT: flat_load_dwordx2 v[19:20], v[19:20] ; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v7, v15 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xffffe000, v3 -; GFX8-NEXT: s_mov_b64 s[0:1], vcc -; GFX8-NEXT: v_addc_u32_e64 v10, vcc, -1, v4, s[2:3] -; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v11, v13 -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, v12, v8, vcc -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xffffe800, v3 -; GFX8-NEXT: s_mov_b64 s[2:3], vcc -; GFX8-NEXT: v_addc_u32_e64 v8, vcc, -1, v4, s[0:1] -; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v5, v13 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v6, v12, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0xfffff000, v3 -; GFX8-NEXT: s_mov_b64 s[0:1], vcc -; GFX8-NEXT: v_addc_u32_e64 v12, vcc, -1, v4, s[2:3] -; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[11:12] -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v9, v13 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v10, v6, vcc -; GFX8-NEXT: 
v_addc_u32_e64 v6, s[0:1], -1, v4, s[0:1] +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, -1, v4, vcc +; GFX8-NEXT: s_waitcnt vmcnt(6) +; GFX8-NEXT: v_add_u32_e32 v21, vcc, v9, v21 +; GFX8-NEXT: v_addc_u32_e32 v22, vcc, v10, v22, vcc ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xfffff800, v3 -; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] +; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] ; GFX8-NEXT: v_addc_u32_e32 v10, vcc, -1, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] -; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v7, v13 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v8, v14, vcc -; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[3:4] +; GFX8-NEXT: s_waitcnt vmcnt(7) +; GFX8-NEXT: v_add_u32_e32 v21, vcc, v11, v21 +; GFX8-NEXT: v_addc_u32_e32 v22, vcc, v12, v22, vcc +; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[3:4] ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x10000, v3 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX8-NEXT: s_waitcnt vmcnt(7) +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v21 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v14, v22, vcc +; GFX8-NEXT: s_waitcnt vmcnt(6) +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v15, v13 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v16, v14, vcc +; GFX8-NEXT: s_waitcnt vmcnt(5) +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v17, v13 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v18, v14, vcc +; GFX8-NEXT: s_waitcnt vmcnt(4) +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v19, v13 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v20, v14, vcc ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v13 -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, v12, v14, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v13 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v6, v14, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v11 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v6, v12, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v9, v5 ; GFX8-NEXT: 
v_addc_u32_e32 v6, vcc, v10, v6, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v11, v5 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v12, v6, vcc ; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; GFX8-NEXT: s_add_i32 s0, s4, -1 -; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: s_add_i32 s1, s0, -1 +; GFX8-NEXT: s_cmp_eq_u32 s0, 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB1_5 ; GFX8-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 -; GFX8-NEXT: s_mov_b32 s4, s0 +; GFX8-NEXT: s_mov_b32 s0, s1 ; GFX8-NEXT: s_branch .LBB1_1 ; GFX8-NEXT: .LBB1_5: ; %while.end ; GFX8-NEXT: v_mov_b32_e32 v1, s35 @@ -600,63 +591,61 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX900-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, 0xffffb000, v3 -; GFX900-NEXT: s_mov_b64 s[0:1], vcc -; GFX900-NEXT: v_addc_co_u32_e64 v8, s[0:1], -1, v4, s[0:1] +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, -1, v4, vcc ; GFX900-NEXT: global_load_dwordx2 v[9:10], v[3:4], off offset:-4096 ; GFX900-NEXT: global_load_dwordx2 v[11:12], v[3:4], off offset:-2048 ; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, 0xffffc000, v3 ; GFX900-NEXT: global_load_dwordx2 v[7:8], v[7:8], off ; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v4, vcc +; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048 +; GFX900-NEXT: global_load_dwordx2 v[19:20], v[13:14], off +; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, s2, v3 +; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, -1, v4, vcc +; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, s3, v3 +; GFX900-NEXT: global_load_dwordx2 v[15:16], v[15:16], off offset:-2048 +; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v4, vcc ; GFX900-NEXT: s_addk_i32 s6, 0x2000 ; GFX900-NEXT: s_cmp_gt_u32 s6, 0x3fffff -; GFX900-NEXT: 
s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v7, v5 -; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v6, vcc -; GFX900-NEXT: global_load_dwordx2 v[5:6], v[13:14], off offset:-2048 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, v5, v7 -; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v6, v8, vcc -; GFX900-NEXT: global_load_dwordx2 v[6:7], v[13:14], off -; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, s2, v3 -; GFX900-NEXT: s_mov_b64 s[0:1], vcc -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, v6, v15 -; GFX900-NEXT: v_addc_co_u32_e64 v6, s[0:1], -1, v4, s[0:1] -; GFX900-NEXT: global_load_dwordx2 v[5:6], v[5:6], off offset:-2048 -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, v7, v8, vcc -; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, s3, v3 -; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, -1, v4, vcc -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, v5, v13 -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, v6, v14, vcc -; GFX900-NEXT: global_load_dwordx2 v[5:6], v[7:8], off offset:-4096 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, v5, v13 -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, v6, v14, vcc -; GFX900-NEXT: global_load_dwordx2 v[5:6], v[7:8], off offset:-2048 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, v5, v13 -; GFX900-NEXT: global_load_dwordx2 v[7:8], v[7:8], off -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, v6, v14, vcc +; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: v_add_co_u32_e32 v21, vcc, v7, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v6, vcc +; GFX900-NEXT: global_load_dwordx2 v[7:8], v[13:14], off offset:-4096 +; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: v_add_co_u32_e64 v23, s[0:1], v17, v21 +; GFX900-NEXT: v_addc_co_u32_e64 v24, s[0:1], v18, v6, s[0:1] +; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048 +; GFX900-NEXT: global_load_dwordx2 v[21:22], v[13:14], off ; GFX900-NEXT: 
v_add_co_u32_e32 v5, vcc, s5, v3 ; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v4, vcc ; GFX900-NEXT: global_load_dwordx2 v[5:6], v[5:6], off offset:-2048 -; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, v7, v13 -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, v8, v14, vcc -; GFX900-NEXT: global_load_dwordx2 v[7:8], v[3:4], off +; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: v_add_co_u32_e32 v19, vcc, v19, v23 +; GFX900-NEXT: global_load_dwordx2 v[13:14], v[3:4], off +; GFX900-NEXT: v_addc_co_u32_e32 v20, vcc, v20, v24, vcc ; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, 0x10000, v3 ; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, v15, v19 +; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, v16, v20, vcc +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v7, v15 +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v16, vcc +; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v17, v7 +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v18, v8, vcc +; GFX900-NEXT: s_waitcnt vmcnt(2) +; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v21, v7 +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v22, v8, vcc ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v5, v13 -; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v14, vcc +; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v8, vcc ; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v9, v5 ; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v10, v6, vcc ; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v11, v5 ; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v12, v6, vcc ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 -; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v6, vcc +; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, v13, v5 +; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, v14, v6, vcc ; GFX900-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX900-NEXT: ; %bb.3: ; 
%while.cond.loopexit ; GFX900-NEXT: ; in Loop: Header=BB1_1 Depth=1