Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1466,6 +1466,13 @@ def FeatureClusters : SubtargetFeature< "clusters",
"Has clusters of workgroups support"
>;

def FeatureWaitsBeforeSystemScopeStores : SubtargetFeature<
"waits-before-system-scope-stores",
"RequiresWaitsBeforeSystemScopeStores",
"true",
"Target requires waits for loads and atomics before system scope stores"
>;

// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true",
Expand Down Expand Up @@ -2060,7 +2067,8 @@ def FeatureISAVersion12 : FeatureSet<
FeatureMaxHardClauseLength32,
Feature1_5xVGPRs,
FeatureMemoryAtomicFAddF32DenormalSupport,
FeatureBVHDualAndBVH8Insts
FeatureBVHDualAndBVH8Insts,
FeatureWaitsBeforeSystemScopeStores,
]>;

def FeatureISAVersion12_50 : FeatureSet<
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool Has45BitNumRecordsBufferResource = false;

bool HasClusters = false;
bool RequiresWaitsBeforeSystemScopeStores = false;

// Dummy feature to use for assembler in tablegen.
bool FeatureDisable = false;
Expand Down Expand Up @@ -1861,6 +1862,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool has45BitNumRecordsBufferResource() const {
return Has45BitNumRecordsBufferResource;
}

bool requiresWaitsBeforeSystemScopeStores() const {
return RequiresWaitsBeforeSystemScopeStores;
}
};

class GCNUserSGPRUsageInfo {
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2673,7 +2673,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
const unsigned Scope = CPol->getImm() & CPol::SCOPE;

// GFX12.0 only: Extra waits needed before system scope stores.
if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS)
if (ST.requiresWaitsBeforeSystemScopeStores() && !Atomic &&
Scope == CPol::SCOPE_SYS)
Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());

return Changed;
Expand Down
50 changes: 39 additions & 11 deletions llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
Original file line number Diff line number Diff line change
@@ -1,22 +1,50 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@petar-avramovic this test added in #82996 is strange because it never actually showed any waits being inserted.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Think waits are optimized out in final output of ll test, proper test where waits are inserted is mir version of the same test llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir
Iirc test shows how to generate scope:SCOPE_SYS, options are volatile store or intinsic with argument that corresponds to the bitmask for scope

; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-SDAG %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-GISEL %s

define amdgpu_ps void @intrinsic_store_system_scope(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GFX12-LABEL: intrinsic_store_system_scope:
; GFX12: ; %bb.0:
; GFX12-NEXT: buffer_store_b32 v0, v[1:2], s[0:3], s4 idxen offen scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
; GFX1200-LABEL: intrinsic_store_system_scope:
; GFX1200: ; %bb.0:
; GFX1200-NEXT: buffer_store_b32 v0, v[1:2], s[0:3], s4 idxen offen scope:SCOPE_SYS
; GFX1200-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: intrinsic_store_system_scope:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: buffer_store_b32 v0, v[2:3], s[0:3], s4 idxen offen scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: intrinsic_store_system_scope:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: buffer_store_b32 v0, v[4:5], s[0:3], s4 idxen offen scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_endpgm
call void @llvm.amdgcn.struct.buffer.store.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 24)
ret void
}

define amdgpu_ps void @generic_store_volatile(i32 %val, ptr addrspace(1) %out) {
; GFX12-LABEL: generic_store_volatile:
; GFX12: ; %bb.0:
; GFX12-NEXT: global_store_b32 v[1:2], v0, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_endpgm
; GFX1200-LABEL: generic_store_volatile:
; GFX1200: ; %bb.0:
; GFX1200-NEXT: global_store_b32 v[1:2], v0, off scope:SCOPE_SYS
; GFX1200-NEXT: s_wait_storecnt 0x0
; GFX1200-NEXT: s_endpgm
;
; GFX1250-SDAG-LABEL: generic_store_volatile:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: global_store_b32 v[2:3], v0, off scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: generic_store_volatile:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: global_store_b32 v[4:5], v0, off scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: s_endpgm
store volatile i32 %val, ptr addrspace(1) %out
ret void
}