From e8c35cf5383689c71baee2785e6170e149e128e7 Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Tue, 18 Nov 2025 14:50:36 +0900 Subject: [PATCH] [AMDGPU] Ignore wavefront barrier latency DAG mutation Do not add latency for wavefront and singlethread scope fences during barrier latency DAG mutation. These scopes do not typically introduce any latency and adjusting schedules based on them significantly impacts latency hiding. --- .../Target/AMDGPU/AMDGPUBarrierLatency.cpp | 22 +- llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h | 5 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 6 +- .../AMDGPU/schedule-barrier-latency.mir | 279 +++++++++++++++++- 4 files changed, 301 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp index 30a1f05a8a390..2e586ea207af5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp @@ -27,8 +27,17 @@ using namespace llvm; namespace { class BarrierLatency : public ScheduleDAGMutation { +private: + SmallSet<SyncScope::ID, 4> IgnoredScopes; + public: - BarrierLatency() = default; + BarrierLatency(MachineFunction *MF) { + LLVMContext &Context = MF->getFunction().getContext(); + IgnoredScopes.insert(SyncScope::SingleThread); + IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront")); + IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront-one-as")); + IgnoredScopes.insert(Context.getOrInsertSyncScopeID("singlethread-one-as")); + } void apply(ScheduleDAGInstrs *DAG) override; }; @@ -40,8 +49,11 @@ void BarrierLatency::apply(ScheduleDAGInstrs *DAG) { continue; // Update latency on barrier edges of ATOMIC_FENCE. - // We don't consider the scope of the fence or type of instruction - // involved in the barrier edge. + // Ignore scopes not expected to have any latency.
+ SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); + if (IgnoredScopes.contains(SSID)) + continue; + for (SDep &PredDep : SU.Preds) { if (!PredDep.isBarrier()) continue; @@ -68,6 +80,6 @@ void BarrierLatency::apply(ScheduleDAGInstrs *DAG) { } // end namespace std::unique_ptr<ScheduleDAGMutation> -llvm::createAMDGPUBarrierLatencyDAGMutation() { - return std::make_unique<BarrierLatency>(); +llvm::createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF) { + return std::make_unique<BarrierLatency>(MF); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h index c23f0b99fe822..547cd2a11f7df 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h @@ -14,7 +14,10 @@ namespace llvm { -std::unique_ptr<ScheduleDAGMutation> createAMDGPUBarrierLatencyDAGMutation(); +class MachineFunction; + +std::unique_ptr<ScheduleDAGMutation> +createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF); } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 5ff16e29bbbb1..0346580ffa684 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -647,7 +647,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); - DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF)); return DAG; } @@ -668,7 +668,7 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) { if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); - DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF)); return
DAG; } @@ -1209,7 +1209,7 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const { EnableVOPD) DAG->addMutation(createVOPDPairingMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); - DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF)); return DAG; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir index 93f7bcc478737..30cc241b55271 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir @@ -1,16 +1,32 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -passes=postmisched -o - %s | FileCheck %s +# Ensure syncscope IDs defined in expected order +# SSID 2 = workgroup +# SSID 3 = wavefront +--- | + define amdgpu_cs void @test_workgroup() { + fence syncscope("workgroup") acq_rel + fence syncscope("wavefront") acq_rel + ret void + } + define amdgpu_cs void @test_wavefront() { + fence syncscope("workgroup") acq_rel + fence syncscope("wavefront") acq_rel + ret void + } +... + # Ensure WMMA operations stay before the final atomic fence and barrier group. # This allows the latency of the WMMA operations to be hidden by barrier wait. 
--- -name: test +name: test_workgroup tracksRegLiveness: true body: | bb.0: liveins: $sgpr0, $sgpr12, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8, $vgpr9_vgpr10_vgpr11_vgpr12, $vgpr13_vgpr14_vgpr15_vgpr16, $vgpr17_vgpr18_vgpr19_vgpr20, $vgpr21_vgpr22_vgpr23_vgpr24, $vgpr25_vgpr26_vgpr27_vgpr28, $vgpr29_vgpr30_vgpr31_vgpr32 - ; CHECK-LABEL: name: test + ; CHECK-LABEL: name: test_workgroup ; CHECK: liveins: $sgpr0, $sgpr12, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8, $vgpr9_vgpr10_vgpr11_vgpr12, $vgpr13_vgpr14_vgpr15_vgpr16, $vgpr17_vgpr18_vgpr19_vgpr20, $vgpr21_vgpr22_vgpr23_vgpr24, $vgpr25_vgpr26_vgpr27_vgpr28, $vgpr29_vgpr30_vgpr31_vgpr32 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: ATOMIC_FENCE 5, 2 @@ -81,3 +97,262 @@ body: | ATOMIC_FENCE 4, 2 ... + +# Ensure VALU operations are not unduly redistributed between wavefront fences +# causing a loss of latency hiding. +--- +name: test_wavefront +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr1, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr14 + ; CHECK-LABEL: name: test_wavefront + ; CHECK: liveins: $vgpr1, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr14 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: ATOMIC_FENCE 6, 2 + ; CHECK-NEXT: S_BARRIER + ; CHECK-NEXT: BUNDLE implicit killed $vgpr9, implicit killed $vgpr10, implicit killed $vgpr12, implicit $exec, implicit killed $vgpr8, implicit killed $vgpr11, implicit killed $vgpr14 { + ; CHECK-NEXT: DS_WRITE2_B32_gfx9 killed $vgpr9, killed $vgpr10, killed $vgpr12, 0, 16, 0, implicit $exec + ; CHECK-NEXT: DS_WRITE2ST64_B32_gfx9 killed $vgpr8, killed $vgpr11, killed $vgpr14, 0, 4, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 2 + ; CHECK-NEXT: S_BARRIER + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr34, implicit-def $vgpr35, implicit $vgpr7, implicit $exec,
implicit $vgpr6 { + ; CHECK-NEXT: $vgpr34 = DS_READ_B32_gfx9 $vgpr7, 2096, 0, implicit $exec + ; CHECK-NEXT: $vgpr35 = DS_READ_B32_gfx9 $vgpr6, 768, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr36, implicit-def $vgpr37, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr36 = DS_READ_B32_gfx9 $vgpr7, 2100, 0, implicit $exec + ; CHECK-NEXT: $vgpr37 = DS_READ_B32_gfx9 $vgpr6, 832, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr38, implicit-def $vgpr39, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr38 = DS_READ_B32_gfx9 $vgpr7, 2104, 0, implicit $exec + ; CHECK-NEXT: $vgpr39 = DS_READ_B32_gfx9 $vgpr6, 896, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr40, implicit-def $vgpr41, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr40 = DS_READ_B32_gfx9 $vgpr7, 2108, 0, implicit $exec + ; CHECK-NEXT: $vgpr41 = DS_READ_B32_gfx9 $vgpr6, 960, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr42, implicit-def $vgpr43, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr42 = DS_READ_B32_gfx9 $vgpr7, 2112, 0, implicit $exec + ; CHECK-NEXT: $vgpr43 = DS_READ_B32_gfx9 $vgpr6, 1024, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr44, implicit-def $vgpr45, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr44 = DS_READ_B32_gfx9 $vgpr7, 2116, 0, implicit $exec + ; CHECK-NEXT: $vgpr45 = DS_READ_B32_gfx9 $vgpr6, 1088, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr46, implicit-def $vgpr47, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr46 = DS_READ_B32_gfx9 $vgpr7, 2120, 0, 
implicit $exec + ; CHECK-NEXT: $vgpr47 = DS_READ_B32_gfx9 $vgpr6, 1152, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr11, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr10 = DS_READ_B32_gfx9 $vgpr7, 2124, 0, implicit $exec + ; CHECK-NEXT: $vgpr11 = DS_READ_B32_gfx9 $vgpr6, 1216, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr12, implicit-def $vgpr13, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr12 = DS_READ_B32_gfx9 $vgpr7, 2128, 0, implicit $exec + ; CHECK-NEXT: $vgpr13 = DS_READ_B32_gfx9 $vgpr6, 1280, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr14, implicit-def $vgpr15, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr14 = DS_READ_B32_gfx9 $vgpr7, 2132, 0, implicit $exec + ; CHECK-NEXT: $vgpr15 = DS_READ_B32_gfx9 $vgpr6, 1344, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr34, killed $vgpr35, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: BUNDLE implicit-def $vgpr16, implicit-def $vgpr17, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr16 = DS_READ_B32_gfx9 $vgpr7, 2136, 0, implicit $exec + ; CHECK-NEXT: $vgpr17 = DS_READ_B32_gfx9 $vgpr6, 1408, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr18, implicit-def $vgpr19, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr18 = DS_READ_B32_gfx9 $vgpr7, 2140, 0, implicit $exec + ; CHECK-NEXT: $vgpr19 = DS_READ_B32_gfx9 $vgpr6, 1472, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr36, killed $vgpr37, killed $vgpr1, implicit $mode, implicit $exec 
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr20, implicit-def $vgpr21, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr20 = DS_READ_B32_gfx9 $vgpr7, 2144, 0, implicit $exec + ; CHECK-NEXT: $vgpr21 = DS_READ_B32_gfx9 $vgpr6, 1536, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr22, implicit-def $vgpr23, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr22 = DS_READ_B32_gfx9 $vgpr7, 2148, 0, implicit $exec + ; CHECK-NEXT: $vgpr23 = DS_READ_B32_gfx9 $vgpr6, 1600, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr38, killed $vgpr39, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: BUNDLE implicit-def $vgpr24, implicit-def $vgpr25, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr24 = DS_READ_B32_gfx9 $vgpr7, 2152, 0, implicit $exec + ; CHECK-NEXT: $vgpr25 = DS_READ_B32_gfx9 $vgpr6, 1664, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr26, implicit-def $vgpr27, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr26 = DS_READ_B32_gfx9 $vgpr7, 2156, 0, implicit $exec + ; CHECK-NEXT: $vgpr27 = DS_READ_B32_gfx9 $vgpr6, 1728, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr40, killed $vgpr41, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: BUNDLE implicit-def $vgpr28, implicit-def $vgpr29, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr28 = DS_READ_B32_gfx9 $vgpr7, 2160, 0, implicit $exec + ; CHECK-NEXT: $vgpr29 = DS_READ_B32_gfx9 $vgpr6, 1792, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr30, implicit-def $vgpr31, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr30 
= DS_READ_B32_gfx9 $vgpr7, 2164, 0, implicit $exec + ; CHECK-NEXT: $vgpr31 = DS_READ_B32_gfx9 $vgpr6, 1856, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr42, killed $vgpr43, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: BUNDLE implicit-def $vgpr32, implicit-def $vgpr33, implicit killed $vgpr7, implicit $exec, implicit killed $vgpr6 { + ; CHECK-NEXT: $vgpr32 = DS_READ_B32_gfx9 killed $vgpr7, 2168, 0, implicit $exec + ; CHECK-NEXT: $vgpr33 = DS_READ_B32_gfx9 killed $vgpr6, 1920, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr44, killed $vgpr45, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr46, killed $vgpr47, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr10, killed $vgpr11, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr12, killed $vgpr13, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr14, killed $vgpr15, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr16, killed $vgpr17, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr18, killed $vgpr19, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr20, killed $vgpr21, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr22, killed $vgpr23, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr24, killed $vgpr25, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr26, killed $vgpr27, 
killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr28, killed $vgpr29, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr30, killed $vgpr31, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr32, killed $vgpr33, killed $vgpr1, implicit $mode, implicit $exec + ATOMIC_FENCE 6, 3 + ATOMIC_FENCE 6, 2 + S_BARRIER + BUNDLE implicit $vgpr9, implicit killed $vgpr10, implicit killed $vgpr12, implicit $exec, implicit $vgpr8, implicit killed $vgpr11, implicit killed $vgpr14 { + DS_WRITE2_B32_gfx9 $vgpr9, killed $vgpr10, killed $vgpr12, 0, 16, 0, implicit $exec + DS_WRITE2ST64_B32_gfx9 $vgpr8, killed $vgpr11, killed $vgpr14, 0, 4, 0, implicit $exec + } + ATOMIC_FENCE 6, 2 + S_BARRIER + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr34, implicit-def $vgpr35, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr34 = DS_READ_B32_gfx9 $vgpr7, 2096, 0, implicit $exec + $vgpr35 = DS_READ_B32_gfx9 $vgpr6, 768, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr36, implicit-def $vgpr37, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr36 = DS_READ_B32_gfx9 $vgpr7, 2100, 0, implicit $exec + $vgpr37 = DS_READ_B32_gfx9 $vgpr6, 832, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr38, implicit-def $vgpr39, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr38 = DS_READ_B32_gfx9 $vgpr7, 2104, 0, implicit $exec + $vgpr39 = DS_READ_B32_gfx9 $vgpr6, 896, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr40, implicit-def $vgpr41, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr40 = DS_READ_B32_gfx9 $vgpr7, 2108, 0, implicit $exec + $vgpr41 = DS_READ_B32_gfx9 $vgpr6, 960, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr42, implicit-def $vgpr43, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr42 = 
DS_READ_B32_gfx9 $vgpr7, 2112, 0, implicit $exec + $vgpr43 = DS_READ_B32_gfx9 $vgpr6, 1024, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr44, implicit-def $vgpr45, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr44 = DS_READ_B32_gfx9 $vgpr7, 2116, 0, implicit $exec + $vgpr45 = DS_READ_B32_gfx9 $vgpr6, 1088, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr46, implicit-def $vgpr47, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr46 = DS_READ_B32_gfx9 $vgpr7, 2120, 0, implicit $exec + $vgpr47 = DS_READ_B32_gfx9 $vgpr6, 1152, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr10, implicit-def $vgpr11, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr10 = DS_READ_B32_gfx9 $vgpr7, 2124, 0, implicit $exec + $vgpr11 = DS_READ_B32_gfx9 $vgpr6, 1216, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr12, implicit-def $vgpr13, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr12 = DS_READ_B32_gfx9 $vgpr7, 2128, 0, implicit $exec + $vgpr13 = DS_READ_B32_gfx9 $vgpr6, 1280, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr14, implicit-def $vgpr15, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr14 = DS_READ_B32_gfx9 $vgpr7, 2132, 0, implicit $exec + $vgpr15 = DS_READ_B32_gfx9 $vgpr6, 1344, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr16, implicit-def $vgpr17, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr16 = DS_READ_B32_gfx9 $vgpr7, 2136, 0, implicit $exec + $vgpr17 = DS_READ_B32_gfx9 $vgpr6, 1408, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr18, implicit-def $vgpr19, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr18 = DS_READ_B32_gfx9 $vgpr7, 2140, 0, implicit $exec + $vgpr19 = DS_READ_B32_gfx9 $vgpr6, 1472, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr20, implicit-def $vgpr21, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr20 = 
DS_READ_B32_gfx9 $vgpr7, 2144, 0, implicit $exec + $vgpr21 = DS_READ_B32_gfx9 $vgpr6, 1536, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr22, implicit-def $vgpr23, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr22 = DS_READ_B32_gfx9 $vgpr7, 2148, 0, implicit $exec + $vgpr23 = DS_READ_B32_gfx9 $vgpr6, 1600, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr24, implicit-def $vgpr25, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr24 = DS_READ_B32_gfx9 $vgpr7, 2152, 0, implicit $exec + $vgpr25 = DS_READ_B32_gfx9 $vgpr6, 1664, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr26, implicit-def $vgpr27, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr26 = DS_READ_B32_gfx9 $vgpr7, 2156, 0, implicit $exec + $vgpr27 = DS_READ_B32_gfx9 $vgpr6, 1728, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr28, implicit-def $vgpr29, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr28 = DS_READ_B32_gfx9 $vgpr7, 2160, 0, implicit $exec + $vgpr29 = DS_READ_B32_gfx9 $vgpr6, 1792, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr30, implicit-def $vgpr31, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr30 = DS_READ_B32_gfx9 $vgpr7, 2164, 0, implicit $exec + $vgpr31 = DS_READ_B32_gfx9 $vgpr6, 1856, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr32, implicit-def $vgpr33, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr32 = DS_READ_B32_gfx9 $vgpr7, 2168, 0, implicit $exec + $vgpr33 = DS_READ_B32_gfx9 $vgpr6, 1920, 0, implicit $exec + } + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr34, killed $vgpr35, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr36, killed $vgpr37, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr38, killed $vgpr39, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 
killed $vgpr40, killed $vgpr41, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr42, killed $vgpr43, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr44, killed $vgpr45, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr46, killed $vgpr47, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr10, killed $vgpr11, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr12, killed $vgpr13, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr14, killed $vgpr15, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr16, killed $vgpr17, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr18, killed $vgpr19, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr20, killed $vgpr21, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr22, killed $vgpr23, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr24, killed $vgpr25, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr26, killed $vgpr27, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr28, killed $vgpr29, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr30, killed $vgpr31, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr32, killed $vgpr33, killed $vgpr1, implicit $mode, implicit $exec +...