-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[AMDGPU] Ignore wavefront barrier latency during scheduling DAG mutation #168500
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Do not add latency for wavefront and singlethread scope fences during barrier latency DAG mutation. These scopes do not typically introduce any latency, and adjusting schedules based on them significantly harms latency hiding.
|
@llvm/pr-subscribers-backend-amdgpu Author: Carl Ritson (perlfu) Changes: Do not add latency for wavefront and singlethread scope fences during barrier latency DAG mutation. Patch is 23.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168500.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
index 30a1f05a8a390..2e586ea207af5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
@@ -27,8 +27,17 @@ using namespace llvm;
namespace {
class BarrierLatency : public ScheduleDAGMutation {
+private:
+ SmallSet<SyncScope::ID, 4> IgnoredScopes;
+
public:
- BarrierLatency() = default;
+ BarrierLatency(MachineFunction *MF) {
+ LLVMContext &Context = MF->getFunction().getContext();
+ IgnoredScopes.insert(SyncScope::SingleThread);
+ IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront"));
+ IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront-one-as"));
+ IgnoredScopes.insert(Context.getOrInsertSyncScopeID("singlethread-one-as"));
+ }
void apply(ScheduleDAGInstrs *DAG) override;
};
@@ -40,8 +49,11 @@ void BarrierLatency::apply(ScheduleDAGInstrs *DAG) {
continue;
// Update latency on barrier edges of ATOMIC_FENCE.
- // We don't consider the scope of the fence or type of instruction
- // involved in the barrier edge.
+ // Ignore scopes not expected to have any latency.
+ SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
+ if (IgnoredScopes.contains(SSID))
+ continue;
+
for (SDep &PredDep : SU.Preds) {
if (!PredDep.isBarrier())
continue;
@@ -68,6 +80,6 @@ void BarrierLatency::apply(ScheduleDAGInstrs *DAG) {
} // end namespace
std::unique_ptr<ScheduleDAGMutation>
-llvm::createAMDGPUBarrierLatencyDAGMutation() {
- return std::make_unique<BarrierLatency>();
+llvm::createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF) {
+ return std::make_unique<BarrierLatency>(MF);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h
index c23f0b99fe822..547cd2a11f7df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h
@@ -14,7 +14,10 @@
namespace llvm {
-std::unique_ptr<ScheduleDAGMutation> createAMDGPUBarrierLatencyDAGMutation();
+class MachineFunction;
+
+std::unique_ptr<ScheduleDAGMutation>
+createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF);
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 5ff16e29bbbb1..0346580ffa684 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -647,7 +647,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
- DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
+ DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
return DAG;
}
@@ -668,7 +668,7 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
- DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
+ DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
return DAG;
}
@@ -1209,7 +1209,7 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
EnableVOPD)
DAG->addMutation(createVOPDPairingMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
- DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
+ DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
return DAG;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir
index 93f7bcc478737..30cc241b55271 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir
+++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir
@@ -1,16 +1,32 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -passes=postmisched -o - %s | FileCheck %s
+# Ensure syncscope IDs are defined in the expected order
+# SSID 2 = workgroup
+# SSID 3 = wavefront
+--- |
+ define amdgpu_cs void @test_workgroup() {
+ fence syncscope("workgroup") acq_rel
+ fence syncscope("wavefront") acq_rel
+ ret void
+ }
+ define amdgpu_cs void @test_wavefront() {
+ fence syncscope("workgroup") acq_rel
+ fence syncscope("wavefront") acq_rel
+ ret void
+ }
+...
+
# Ensure WMMA operations stay before the final atomic fence and barrier group.
# This allows the latency of the WMMA operations to be hidden by barrier wait.
---
-name: test
+name: test_workgroup
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0, $sgpr12, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8, $vgpr9_vgpr10_vgpr11_vgpr12, $vgpr13_vgpr14_vgpr15_vgpr16, $vgpr17_vgpr18_vgpr19_vgpr20, $vgpr21_vgpr22_vgpr23_vgpr24, $vgpr25_vgpr26_vgpr27_vgpr28, $vgpr29_vgpr30_vgpr31_vgpr32
- ; CHECK-LABEL: name: test
+ ; CHECK-LABEL: name: test_workgroup
; CHECK: liveins: $sgpr0, $sgpr12, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8, $vgpr9_vgpr10_vgpr11_vgpr12, $vgpr13_vgpr14_vgpr15_vgpr16, $vgpr17_vgpr18_vgpr19_vgpr20, $vgpr21_vgpr22_vgpr23_vgpr24, $vgpr25_vgpr26_vgpr27_vgpr28, $vgpr29_vgpr30_vgpr31_vgpr32
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ATOMIC_FENCE 5, 2
@@ -81,3 +97,262 @@ body: |
ATOMIC_FENCE 4, 2
...
+
+# Ensure VALU operations are not unduly redistributed between wavefront fences
+# causing a loss of latency hiding.
+---
+name: test_wavefront
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr1, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr14
+ ; CHECK-LABEL: name: test_wavefront
+ ; CHECK: liveins: $vgpr1, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr14
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 3
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 2
+ ; CHECK-NEXT: S_BARRIER
+ ; CHECK-NEXT: BUNDLE implicit killed $vgpr9, implicit killed $vgpr10, implicit killed $vgpr12, implicit $exec, implicit killed $vgpr8, implicit killed $vgpr11, implicit killed $vgpr14 {
+ ; CHECK-NEXT: DS_WRITE2_B32_gfx9 killed $vgpr9, killed $vgpr10, killed $vgpr12, 0, 16, 0, implicit $exec
+ ; CHECK-NEXT: DS_WRITE2ST64_B32_gfx9 killed $vgpr8, killed $vgpr11, killed $vgpr14, 0, 4, 0, implicit $exec
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 2
+ ; CHECK-NEXT: S_BARRIER
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 3
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr34, implicit-def $vgpr35, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ ; CHECK-NEXT: $vgpr34 = DS_READ_B32_gfx9 $vgpr7, 2096, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr35 = DS_READ_B32_gfx9 $vgpr6, 768, 0, implicit $exec
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 3
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr36, implicit-def $vgpr37, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ ; CHECK-NEXT: $vgpr36 = DS_READ_B32_gfx9 $vgpr7, 2100, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr37 = DS_READ_B32_gfx9 $vgpr6, 832, 0, implicit $exec
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 3
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr38, implicit-def $vgpr39, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ ; CHECK-NEXT: $vgpr38 = DS_READ_B32_gfx9 $vgpr7, 2104, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr39 = DS_READ_B32_gfx9 $vgpr6, 896, 0, implicit $exec
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 3
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr40, implicit-def $vgpr41, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ ; CHECK-NEXT: $vgpr40 = DS_READ_B32_gfx9 $vgpr7, 2108, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr41 = DS_READ_B32_gfx9 $vgpr6, 960, 0, implicit $exec
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 3
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr42, implicit-def $vgpr43, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ ; CHECK-NEXT: $vgpr42 = DS_READ_B32_gfx9 $vgpr7, 2112, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr43 = DS_READ_B32_gfx9 $vgpr6, 1024, 0, implicit $exec
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 3
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr44, implicit-def $vgpr45, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ ; CHECK-NEXT: $vgpr44 = DS_READ_B32_gfx9 $vgpr7, 2116, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr45 = DS_READ_B32_gfx9 $vgpr6, 1088, 0, implicit $exec
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 3
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr46, implicit-def $vgpr47, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ ; CHECK-NEXT: $vgpr46 = DS_READ_B32_gfx9 $vgpr7, 2120, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr47 = DS_READ_B32_gfx9 $vgpr6, 1152, 0, implicit $exec
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 3
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr11, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ ; CHECK-NEXT: $vgpr10 = DS_READ_B32_gfx9 $vgpr7, 2124, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr11 = DS_READ_B32_gfx9 $vgpr6, 1216, 0, implicit $exec
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 3
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr12, implicit-def $vgpr13, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ ; CHECK-NEXT: $vgpr12 = DS_READ_B32_gfx9 $vgpr7, 2128, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr13 = DS_READ_B32_gfx9 $vgpr6, 1280, 0, implicit $exec
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 3
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr14, implicit-def $vgpr15, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ ; CHECK-NEXT: $vgpr14 = DS_READ_B32_gfx9 $vgpr7, 2132, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr15 = DS_READ_B32_gfx9 $vgpr6, 1344, 0, implicit $exec
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 3
+ ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr34, killed $vgpr35, killed $vgpr1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr16, implicit-def $vgpr17, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ ; CHECK-NEXT: $vgpr16 = DS_READ_B32_gfx9 $vgpr7, 2136, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr17 = DS_READ_B32_gfx9 $vgpr6, 1408, 0, implicit $exec
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 3
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr18, implicit-def $vgpr19, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ ; CHECK-NEXT: $vgpr18 = DS_READ_B32_gfx9 $vgpr7, 2140, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr19 = DS_READ_B32_gfx9 $vgpr6, 1472, 0, implicit $exec
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 3
+ ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr36, killed $vgpr37, killed $vgpr1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr20, implicit-def $vgpr21, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ ; CHECK-NEXT: $vgpr20 = DS_READ_B32_gfx9 $vgpr7, 2144, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr21 = DS_READ_B32_gfx9 $vgpr6, 1536, 0, implicit $exec
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 3
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr22, implicit-def $vgpr23, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ ; CHECK-NEXT: $vgpr22 = DS_READ_B32_gfx9 $vgpr7, 2148, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr23 = DS_READ_B32_gfx9 $vgpr6, 1600, 0, implicit $exec
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 3
+ ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr38, killed $vgpr39, killed $vgpr1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr24, implicit-def $vgpr25, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ ; CHECK-NEXT: $vgpr24 = DS_READ_B32_gfx9 $vgpr7, 2152, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr25 = DS_READ_B32_gfx9 $vgpr6, 1664, 0, implicit $exec
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 3
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr26, implicit-def $vgpr27, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ ; CHECK-NEXT: $vgpr26 = DS_READ_B32_gfx9 $vgpr7, 2156, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr27 = DS_READ_B32_gfx9 $vgpr6, 1728, 0, implicit $exec
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 3
+ ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr40, killed $vgpr41, killed $vgpr1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr28, implicit-def $vgpr29, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ ; CHECK-NEXT: $vgpr28 = DS_READ_B32_gfx9 $vgpr7, 2160, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr29 = DS_READ_B32_gfx9 $vgpr6, 1792, 0, implicit $exec
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 3
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr30, implicit-def $vgpr31, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ ; CHECK-NEXT: $vgpr30 = DS_READ_B32_gfx9 $vgpr7, 2164, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr31 = DS_READ_B32_gfx9 $vgpr6, 1856, 0, implicit $exec
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: ATOMIC_FENCE 6, 3
+ ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr42, killed $vgpr43, killed $vgpr1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr32, implicit-def $vgpr33, implicit killed $vgpr7, implicit $exec, implicit killed $vgpr6 {
+ ; CHECK-NEXT: $vgpr32 = DS_READ_B32_gfx9 killed $vgpr7, 2168, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr33 = DS_READ_B32_gfx9 killed $vgpr6, 1920, 0, implicit $exec
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr44, killed $vgpr45, killed $vgpr1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr46, killed $vgpr47, killed $vgpr1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr10, killed $vgpr11, killed $vgpr1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr12, killed $vgpr13, killed $vgpr1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr14, killed $vgpr15, killed $vgpr1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr16, killed $vgpr17, killed $vgpr1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr18, killed $vgpr19, killed $vgpr1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr20, killed $vgpr21, killed $vgpr1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr22, killed $vgpr23, killed $vgpr1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr24, killed $vgpr25, killed $vgpr1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr26, killed $vgpr27, killed $vgpr1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr28, killed $vgpr29, killed $vgpr1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr30, killed $vgpr31, killed $vgpr1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr32, killed $vgpr33, killed $vgpr1, implicit $mode, implicit $exec
+ ATOMIC_FENCE 6, 3
+ ATOMIC_FENCE 6, 2
+ S_BARRIER
+ BUNDLE implicit $vgpr9, implicit killed $vgpr10, implicit killed $vgpr12, implicit $exec, implicit $vgpr8, implicit killed $vgpr11, implicit killed $vgpr14 {
+ DS_WRITE2_B32_gfx9 $vgpr9, killed $vgpr10, killed $vgpr12, 0, 16, 0, implicit $exec
+ DS_WRITE2ST64_B32_gfx9 $vgpr8, killed $vgpr11, killed $vgpr14, 0, 4, 0, implicit $exec
+ }
+ ATOMIC_FENCE 6, 2
+ S_BARRIER
+ ATOMIC_FENCE 6, 3
+ BUNDLE implicit-def $vgpr34, implicit-def $vgpr35, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ $vgpr34 = DS_READ_B32_gfx9 $vgpr7, 2096, 0, implicit $exec
+ $vgpr35 = DS_READ_B32_gfx9 $vgpr6, 768, 0, implicit $exec
+ }
+ ATOMIC_FENCE 6, 3
+ BUNDLE implicit-def $vgpr36, implicit-def $vgpr37, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ $vgpr36 = DS_READ_B32_gfx9 $vgpr7, 2100, 0, implicit $exec
+ $vgpr37 = DS_READ_B32_gfx9 $vgpr6, 832, 0, implicit $exec
+ }
+ ATOMIC_FENCE 6, 3
+ BUNDLE implicit-def $vgpr38, implicit-def $vgpr39, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ $vgpr38 = DS_READ_B32_gfx9 $vgpr7, 2104, 0, implicit $exec
+ $vgpr39 = DS_READ_B32_gfx9 $vgpr6, 896, 0, implicit $exec
+ }
+ ATOMIC_FENCE 6, 3
+ BUNDLE implicit-def $vgpr40, implicit-def $vgpr41, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ $vgpr40 = DS_READ_B32_gfx9 $vgpr7, 2108, 0, implicit $exec
+ $vgpr41 = DS_READ_B32_gfx9 $vgpr6, 960, 0, implicit $exec
+ }
+ ATOMIC_FENCE 6, 3
+ BUNDLE implicit-def $vgpr42, implicit-def $vgpr43, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ $vgpr42 = DS_READ_B32_gfx9 $vgpr7, 2112, 0, implicit $exec
+ $vgpr43 = DS_READ_B32_gfx9 $vgpr6, 1024, 0, implicit $exec
+ }
+ ATOMIC_FENCE 6, 3
+ BUNDLE implicit-def $vgpr44, implicit-def $vgpr45, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ $vgpr44 = DS_READ_B32_gfx9 $vgpr7, 2116, 0, implicit $exec
+ $vgpr45 = DS_READ_B32_gfx9 $vgpr6, 1088, 0, implicit $exec
+ }
+ ATOMIC_FENCE 6, 3
+ BUNDLE implicit-def $vgpr46, implicit-def $vgpr47, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ $vgpr46 = DS_READ_B32_gfx9 $vgpr7, 2120, 0, implicit $exec
+ $vgpr47 = DS_READ_B32_gfx9 $vgpr6, 1152, 0, implicit $exec
+ }
+ ATOMIC_FENCE 6, 3
+ BUNDLE implicit-def $vgpr10, implicit-def $vgpr11, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ $vgpr10 = DS_READ_B32_gfx9 $vgpr7, 2124, 0, implicit $exec
+ $vgpr11 = DS_READ_B32_gfx9 $vgpr6, 1216, 0, implicit $exec
+ }
+ ATOMIC_FENCE 6, 3
+ BUNDLE implicit-def $vgpr12, implicit-def $vgpr13, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ $vgpr12 = DS_READ_B32_gfx9 $vgpr7, 2128, 0, implicit $exec
+ $vgpr13 = DS_READ_B32_gfx9 $vgpr6, 1280, 0, implicit $exec
+ }
+ ATOMIC_FENCE 6, 3
+ BUNDLE implicit-def $vgpr14, implicit-def $vgpr15, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ $vgpr14 = DS_READ_B32_gfx9 $vgpr7, 2132, 0, implicit $exec
+ $vgpr15 = DS_READ_B32_gfx9 $vgpr6, 1344, 0, implicit $exec
+ }
+ ATOMIC_FENCE 6, 3
+ BUNDLE implicit-def $vgpr16, implicit-def $vgpr17, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ $vgpr16 = DS_READ_B32_gfx9 $vgpr7, 2136, 0, implicit $exec
+ $vgpr17 = DS_READ_B32_gfx9 $vgpr6, 1408, 0, implicit $exec
+ }
+ ATOMIC_FENCE 6, 3
+ BUNDLE implicit-def $vgpr18, implicit-def $vgpr19, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ $vgpr18 = DS_READ_B32_gfx9 $vgpr7, 2140, 0, implicit $exec
+ $vgpr19 = DS_READ_B32_gfx9 $vgpr6, 1472, 0, implicit $exec
+ }
+ ATOMIC_FENCE 6, 3
+ BUNDLE implicit-def $vgpr20, implicit-def $vgpr21, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ $vgpr20 = DS_READ_B32_gfx9 $vgpr7, 2144, 0, implicit $exec
+ $vgpr21 = DS_READ_B32_gfx9 $vgpr6, 1536, 0, implicit $exec
+ }
+ ATOMIC_FENCE 6, 3
+ BUNDLE implicit-def $vgpr22, implicit-def $vgpr23, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ $vgpr22 = DS_READ_B32_gfx9 $vgpr7, 2148, 0, implicit $exec
+ $vgpr23 = DS_READ_B32_gfx9 $vgpr6, 1600, 0, implicit $exec
+ }
+ ATOMIC_FENCE 6, 3
+ BUNDLE implicit-def $vgpr24, implicit-def $vgpr25, implicit $vgpr7, implicit $exec, implicit $vgpr6 {
+ $vgpr24 = DS_READ_B32_gfx9 $vgpr7, 2152, 0, implicit $exec
+ $vgpr25 = DS_READ_B32_gfx9 $vgpr6, 1664, 0, implicit $exec
+ }
+ ATOMIC_FENCE 6, 3
+ BUNDLE implicit-def $vgpr26, implicit-def $vgpr27, implicit $vgpr7, implicit $exec, implicit $vgpr...
[truncated]
|
🐧 Linux x64 Test Results
|
Do not add latency for wavefront and singlethread scope fences during barrier latency DAG mutation.
These scopes do not typically introduce any latency, and adjusting schedules based on them significantly harms latency hiding.