diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp index 30a1f05a8a390..2e586ea207af5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp @@ -27,8 +27,17 @@ using namespace llvm; namespace { class BarrierLatency : public ScheduleDAGMutation { +private: + SmallSet<SyncScope::ID, 4> IgnoredScopes; + public: - BarrierLatency() = default; + BarrierLatency(MachineFunction *MF) { + LLVMContext &Context = MF->getFunction().getContext(); + IgnoredScopes.insert(SyncScope::SingleThread); + IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront")); + IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront-one-as")); + IgnoredScopes.insert(Context.getOrInsertSyncScopeID("singlethread-one-as")); + } void apply(ScheduleDAGInstrs *DAG) override; }; @@ -40,8 +49,11 @@ void BarrierLatency::apply(ScheduleDAGInstrs *DAG) { continue; // Update latency on barrier edges of ATOMIC_FENCE. - // We don't consider the scope of the fence or type of instruction - // involved in the barrier edge. + // Ignore scopes not expected to have any latency. 
+ SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); + if (IgnoredScopes.contains(SSID)) + continue; + + for (SDep &PredDep : SU.Preds) { if (!PredDep.isBarrier()) continue; @@ -68,6 +80,6 @@ void BarrierLatency::apply(ScheduleDAGInstrs *DAG) { } // end namespace std::unique_ptr<ScheduleDAGMutation> -llvm::createAMDGPUBarrierLatencyDAGMutation() { - return std::make_unique<BarrierLatency>(); +llvm::createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF) { + return std::make_unique<BarrierLatency>(MF); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h index c23f0b99fe822..547cd2a11f7df 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h @@ -14,7 +14,10 @@ namespace llvm { -std::unique_ptr<ScheduleDAGMutation> createAMDGPUBarrierLatencyDAGMutation(); +class MachineFunction; + +std::unique_ptr<ScheduleDAGMutation> +createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF); } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 5ff16e29bbbb1..0346580ffa684 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -647,7 +647,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); - DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF)); return DAG; } @@ -668,7 +668,7 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) { if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); - DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF)); return 
DAG; } @@ -1209,7 +1209,7 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const { EnableVOPD) DAG->addMutation(createVOPDPairingMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); - DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF)); return DAG; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir index 93f7bcc478737..30cc241b55271 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir @@ -1,16 +1,32 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -passes=postmisched -o - %s | FileCheck %s +# Ensure syncscope IDs defined in expected order +# SSID 2 = workgroup +# SSID 3 = wavefront +--- | + define amdgpu_cs void @test_workgroup() { + fence syncscope("workgroup") acq_rel + fence syncscope("wavefront") acq_rel + ret void + } + define amdgpu_cs void @test_wavefront() { + fence syncscope("workgroup") acq_rel + fence syncscope("wavefront") acq_rel + ret void + } +... + # Ensure WMMA operations stay before the final atomic fence and barrier group. # This allows the latency of the WMMA operations to be hidden by barrier wait. 
--- -name: test +name: test_workgroup tracksRegLiveness: true body: | bb.0: liveins: $sgpr0, $sgpr12, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8, $vgpr9_vgpr10_vgpr11_vgpr12, $vgpr13_vgpr14_vgpr15_vgpr16, $vgpr17_vgpr18_vgpr19_vgpr20, $vgpr21_vgpr22_vgpr23_vgpr24, $vgpr25_vgpr26_vgpr27_vgpr28, $vgpr29_vgpr30_vgpr31_vgpr32 - ; CHECK-LABEL: name: test + ; CHECK-LABEL: name: test_workgroup ; CHECK: liveins: $sgpr0, $sgpr12, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8, $vgpr9_vgpr10_vgpr11_vgpr12, $vgpr13_vgpr14_vgpr15_vgpr16, $vgpr17_vgpr18_vgpr19_vgpr20, $vgpr21_vgpr22_vgpr23_vgpr24, $vgpr25_vgpr26_vgpr27_vgpr28, $vgpr29_vgpr30_vgpr31_vgpr32 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: ATOMIC_FENCE 5, 2 @@ -81,3 +97,262 @@ body: | ATOMIC_FENCE 4, 2 ... + +# Ensure VALU operations are not unduly redistributed between wavefront fences +# causing a loss of latency hiding. +--- +name: test_wavefront +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr1, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr14 + ; CHECK-LABEL: name: test_wavefront + ; CHECK: liveins: $vgpr1, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr14 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: ATOMIC_FENCE 6, 2 + ; CHECK-NEXT: S_BARRIER + ; CHECK-NEXT: BUNDLE implicit killed $vgpr9, implicit killed $vgpr10, implicit killed $vgpr12, implicit $exec, implicit killed $vgpr8, implicit killed $vgpr11, implicit killed $vgpr14 { + ; CHECK-NEXT: DS_WRITE2_B32_gfx9 killed $vgpr9, killed $vgpr10, killed $vgpr12, 0, 16, 0, implicit $exec + ; CHECK-NEXT: DS_WRITE2ST64_B32_gfx9 killed $vgpr8, killed $vgpr11, killed $vgpr14, 0, 4, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 2 + ; CHECK-NEXT: S_BARRIER + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr34, implicit-def $vgpr35, implicit $vgpr7, implicit $exec, 
implicit $vgpr6 { + ; CHECK-NEXT: $vgpr34 = DS_READ_B32_gfx9 $vgpr7, 2096, 0, implicit $exec + ; CHECK-NEXT: $vgpr35 = DS_READ_B32_gfx9 $vgpr6, 768, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr36, implicit-def $vgpr37, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr36 = DS_READ_B32_gfx9 $vgpr7, 2100, 0, implicit $exec + ; CHECK-NEXT: $vgpr37 = DS_READ_B32_gfx9 $vgpr6, 832, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr38, implicit-def $vgpr39, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr38 = DS_READ_B32_gfx9 $vgpr7, 2104, 0, implicit $exec + ; CHECK-NEXT: $vgpr39 = DS_READ_B32_gfx9 $vgpr6, 896, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr40, implicit-def $vgpr41, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr40 = DS_READ_B32_gfx9 $vgpr7, 2108, 0, implicit $exec + ; CHECK-NEXT: $vgpr41 = DS_READ_B32_gfx9 $vgpr6, 960, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr42, implicit-def $vgpr43, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr42 = DS_READ_B32_gfx9 $vgpr7, 2112, 0, implicit $exec + ; CHECK-NEXT: $vgpr43 = DS_READ_B32_gfx9 $vgpr6, 1024, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr44, implicit-def $vgpr45, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr44 = DS_READ_B32_gfx9 $vgpr7, 2116, 0, implicit $exec + ; CHECK-NEXT: $vgpr45 = DS_READ_B32_gfx9 $vgpr6, 1088, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr46, implicit-def $vgpr47, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr46 = DS_READ_B32_gfx9 $vgpr7, 2120, 0, 
implicit $exec + ; CHECK-NEXT: $vgpr47 = DS_READ_B32_gfx9 $vgpr6, 1152, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr11, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr10 = DS_READ_B32_gfx9 $vgpr7, 2124, 0, implicit $exec + ; CHECK-NEXT: $vgpr11 = DS_READ_B32_gfx9 $vgpr6, 1216, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr12, implicit-def $vgpr13, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr12 = DS_READ_B32_gfx9 $vgpr7, 2128, 0, implicit $exec + ; CHECK-NEXT: $vgpr13 = DS_READ_B32_gfx9 $vgpr6, 1280, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr14, implicit-def $vgpr15, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr14 = DS_READ_B32_gfx9 $vgpr7, 2132, 0, implicit $exec + ; CHECK-NEXT: $vgpr15 = DS_READ_B32_gfx9 $vgpr6, 1344, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr34, killed $vgpr35, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: BUNDLE implicit-def $vgpr16, implicit-def $vgpr17, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr16 = DS_READ_B32_gfx9 $vgpr7, 2136, 0, implicit $exec + ; CHECK-NEXT: $vgpr17 = DS_READ_B32_gfx9 $vgpr6, 1408, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr18, implicit-def $vgpr19, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr18 = DS_READ_B32_gfx9 $vgpr7, 2140, 0, implicit $exec + ; CHECK-NEXT: $vgpr19 = DS_READ_B32_gfx9 $vgpr6, 1472, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr36, killed $vgpr37, killed $vgpr1, implicit $mode, implicit $exec 
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr20, implicit-def $vgpr21, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr20 = DS_READ_B32_gfx9 $vgpr7, 2144, 0, implicit $exec + ; CHECK-NEXT: $vgpr21 = DS_READ_B32_gfx9 $vgpr6, 1536, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr22, implicit-def $vgpr23, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr22 = DS_READ_B32_gfx9 $vgpr7, 2148, 0, implicit $exec + ; CHECK-NEXT: $vgpr23 = DS_READ_B32_gfx9 $vgpr6, 1600, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr38, killed $vgpr39, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: BUNDLE implicit-def $vgpr24, implicit-def $vgpr25, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr24 = DS_READ_B32_gfx9 $vgpr7, 2152, 0, implicit $exec + ; CHECK-NEXT: $vgpr25 = DS_READ_B32_gfx9 $vgpr6, 1664, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr26, implicit-def $vgpr27, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr26 = DS_READ_B32_gfx9 $vgpr7, 2156, 0, implicit $exec + ; CHECK-NEXT: $vgpr27 = DS_READ_B32_gfx9 $vgpr6, 1728, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr40, killed $vgpr41, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: BUNDLE implicit-def $vgpr28, implicit-def $vgpr29, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr28 = DS_READ_B32_gfx9 $vgpr7, 2160, 0, implicit $exec + ; CHECK-NEXT: $vgpr29 = DS_READ_B32_gfx9 $vgpr6, 1792, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr30, implicit-def $vgpr31, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr30 
= DS_READ_B32_gfx9 $vgpr7, 2164, 0, implicit $exec + ; CHECK-NEXT: $vgpr31 = DS_READ_B32_gfx9 $vgpr6, 1856, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr42, killed $vgpr43, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: BUNDLE implicit-def $vgpr32, implicit-def $vgpr33, implicit killed $vgpr7, implicit $exec, implicit killed $vgpr6 { + ; CHECK-NEXT: $vgpr32 = DS_READ_B32_gfx9 killed $vgpr7, 2168, 0, implicit $exec + ; CHECK-NEXT: $vgpr33 = DS_READ_B32_gfx9 killed $vgpr6, 1920, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr44, killed $vgpr45, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr46, killed $vgpr47, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr10, killed $vgpr11, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr12, killed $vgpr13, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr14, killed $vgpr15, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr16, killed $vgpr17, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr18, killed $vgpr19, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr20, killed $vgpr21, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr22, killed $vgpr23, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr24, killed $vgpr25, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr26, killed $vgpr27, 
killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr28, killed $vgpr29, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr30, killed $vgpr31, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr32, killed $vgpr33, killed $vgpr1, implicit $mode, implicit $exec + ATOMIC_FENCE 6, 3 + ATOMIC_FENCE 6, 2 + S_BARRIER + BUNDLE implicit $vgpr9, implicit killed $vgpr10, implicit killed $vgpr12, implicit $exec, implicit $vgpr8, implicit killed $vgpr11, implicit killed $vgpr14 { + DS_WRITE2_B32_gfx9 $vgpr9, killed $vgpr10, killed $vgpr12, 0, 16, 0, implicit $exec + DS_WRITE2ST64_B32_gfx9 $vgpr8, killed $vgpr11, killed $vgpr14, 0, 4, 0, implicit $exec + } + ATOMIC_FENCE 6, 2 + S_BARRIER + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr34, implicit-def $vgpr35, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr34 = DS_READ_B32_gfx9 $vgpr7, 2096, 0, implicit $exec + $vgpr35 = DS_READ_B32_gfx9 $vgpr6, 768, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr36, implicit-def $vgpr37, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr36 = DS_READ_B32_gfx9 $vgpr7, 2100, 0, implicit $exec + $vgpr37 = DS_READ_B32_gfx9 $vgpr6, 832, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr38, implicit-def $vgpr39, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr38 = DS_READ_B32_gfx9 $vgpr7, 2104, 0, implicit $exec + $vgpr39 = DS_READ_B32_gfx9 $vgpr6, 896, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr40, implicit-def $vgpr41, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr40 = DS_READ_B32_gfx9 $vgpr7, 2108, 0, implicit $exec + $vgpr41 = DS_READ_B32_gfx9 $vgpr6, 960, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr42, implicit-def $vgpr43, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr42 = 
DS_READ_B32_gfx9 $vgpr7, 2112, 0, implicit $exec + $vgpr43 = DS_READ_B32_gfx9 $vgpr6, 1024, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr44, implicit-def $vgpr45, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr44 = DS_READ_B32_gfx9 $vgpr7, 2116, 0, implicit $exec + $vgpr45 = DS_READ_B32_gfx9 $vgpr6, 1088, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr46, implicit-def $vgpr47, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr46 = DS_READ_B32_gfx9 $vgpr7, 2120, 0, implicit $exec + $vgpr47 = DS_READ_B32_gfx9 $vgpr6, 1152, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr10, implicit-def $vgpr11, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr10 = DS_READ_B32_gfx9 $vgpr7, 2124, 0, implicit $exec + $vgpr11 = DS_READ_B32_gfx9 $vgpr6, 1216, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr12, implicit-def $vgpr13, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr12 = DS_READ_B32_gfx9 $vgpr7, 2128, 0, implicit $exec + $vgpr13 = DS_READ_B32_gfx9 $vgpr6, 1280, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr14, implicit-def $vgpr15, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr14 = DS_READ_B32_gfx9 $vgpr7, 2132, 0, implicit $exec + $vgpr15 = DS_READ_B32_gfx9 $vgpr6, 1344, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr16, implicit-def $vgpr17, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr16 = DS_READ_B32_gfx9 $vgpr7, 2136, 0, implicit $exec + $vgpr17 = DS_READ_B32_gfx9 $vgpr6, 1408, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr18, implicit-def $vgpr19, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr18 = DS_READ_B32_gfx9 $vgpr7, 2140, 0, implicit $exec + $vgpr19 = DS_READ_B32_gfx9 $vgpr6, 1472, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr20, implicit-def $vgpr21, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr20 = 
DS_READ_B32_gfx9 $vgpr7, 2144, 0, implicit $exec + $vgpr21 = DS_READ_B32_gfx9 $vgpr6, 1536, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr22, implicit-def $vgpr23, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr22 = DS_READ_B32_gfx9 $vgpr7, 2148, 0, implicit $exec + $vgpr23 = DS_READ_B32_gfx9 $vgpr6, 1600, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr24, implicit-def $vgpr25, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr24 = DS_READ_B32_gfx9 $vgpr7, 2152, 0, implicit $exec + $vgpr25 = DS_READ_B32_gfx9 $vgpr6, 1664, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr26, implicit-def $vgpr27, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr26 = DS_READ_B32_gfx9 $vgpr7, 2156, 0, implicit $exec + $vgpr27 = DS_READ_B32_gfx9 $vgpr6, 1728, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr28, implicit-def $vgpr29, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr28 = DS_READ_B32_gfx9 $vgpr7, 2160, 0, implicit $exec + $vgpr29 = DS_READ_B32_gfx9 $vgpr6, 1792, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr30, implicit-def $vgpr31, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr30 = DS_READ_B32_gfx9 $vgpr7, 2164, 0, implicit $exec + $vgpr31 = DS_READ_B32_gfx9 $vgpr6, 1856, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr32, implicit-def $vgpr33, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr32 = DS_READ_B32_gfx9 $vgpr7, 2168, 0, implicit $exec + $vgpr33 = DS_READ_B32_gfx9 $vgpr6, 1920, 0, implicit $exec + } + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr34, killed $vgpr35, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr36, killed $vgpr37, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr38, killed $vgpr39, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 
killed $vgpr40, killed $vgpr41, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr42, killed $vgpr43, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr44, killed $vgpr45, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr46, killed $vgpr47, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr10, killed $vgpr11, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr12, killed $vgpr13, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr14, killed $vgpr15, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr16, killed $vgpr17, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr18, killed $vgpr19, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr20, killed $vgpr21, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr22, killed $vgpr23, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr24, killed $vgpr25, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr26, killed $vgpr27, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr28, killed $vgpr29, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr30, killed $vgpr31, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr32, killed $vgpr33, killed $vgpr1, implicit $mode, implicit $exec +...