Skip to content

Commit

Permalink
[AMDGPU] Cluster stores as well as loads for GFX11
Browse files Browse the repository at this point in the history
Differential Revision: https://reviews.llvm.org/D128517
  • Loading branch information
jayfoad committed Jun 27, 2022
1 parent aa89bb3 commit b5818e4
Show file tree
Hide file tree
Showing 2 changed files with 133 additions and 0 deletions.
14 changes: 14 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Expand Up @@ -396,9 +396,12 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
ScheduleDAGMILive *DAG =
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createIGroupLPDAGMutation());
DAG->addMutation(createSchedBarrierDAGMutation());
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
Expand All @@ -408,9 +411,12 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {

/// Build a GCNIterativeScheduler using the SCHEDULE_LEGACYMAXOCCUPANCY
/// strategy and attach the memory-clustering DAG mutations to it.
static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  GCNIterativeScheduler *Sched = new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);

  // The load-clustering mutation is added unconditionally.
  Sched->addMutation(createLoadClusterDAGMutation(Sched->TII, Sched->TRI));

  // The store-clustering mutation is added only if the subtarget requests it.
  const GCNSubtarget &Subtarget = C->MF->getSubtarget<GCNSubtarget>();
  if (Subtarget.shouldClusterStores()) {
    Sched->addMutation(createStoreClusterDAGMutation(Sched->TII, Sched->TRI));
  }

  return Sched;
}

Expand All @@ -421,9 +427,12 @@ static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {

/// Build a GCNIterativeScheduler using the SCHEDULE_ILP strategy and attach
/// the memory-clustering and macro-fusion DAG mutations to it.
static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  GCNIterativeScheduler *Sched =
      new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);

  // The load-clustering mutation is added unconditionally.
  Sched->addMutation(createLoadClusterDAGMutation(Sched->TII, Sched->TRI));

  // The store-clustering mutation is added only if the subtarget requests it.
  const GCNSubtarget &Subtarget = C->MF->getSubtarget<GCNSubtarget>();
  if (Subtarget.shouldClusterStores()) {
    Sched->addMutation(createStoreClusterDAGMutation(Sched->TII, Sched->TRI));
  }

  // Macro fusion is registered after the clustering mutations.
  Sched->addMutation(createAMDGPUMacroFusionDAGMutation());

  return Sched;
}
Expand Down Expand Up @@ -898,6 +907,8 @@ class GCNPassConfig final : public AMDGPUPassConfig {
ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
DAG->addMutation(createIGroupLPDAGMutation());
DAG->addMutation(createSchedBarrierDAGMutation());
Expand Down Expand Up @@ -1102,8 +1113,11 @@ bool AMDGPUPassConfig::addGCPasses() {

/// Create the generic live-interval machine scheduler and attach the
/// memory-clustering DAG mutations to it.
llvm::ScheduleDAGInstrs *
AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
  ScheduleDAGMILive *Sched = createGenericSchedLive(C);

  // The load-clustering mutation is added unconditionally.
  Sched->addMutation(createLoadClusterDAGMutation(Sched->TII, Sched->TRI));

  // The store-clustering mutation is added only if the subtarget requests it.
  const GCNSubtarget &Subtarget = C->MF->getSubtarget<GCNSubtarget>();
  if (Subtarget.shouldClusterStores()) {
    Sched->addMutation(createStoreClusterDAGMutation(Sched->TII, Sched->TRI));
  }

  return Sched;
}

Expand Down
119 changes: 119 additions & 0 deletions llvm/test/CodeGen/AMDGPU/cluster_stores.ll
Expand Up @@ -3,6 +3,8 @@
; RUN: FileCheck --enable-var-scope --check-prefix=DBG %s < %t
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefix=GFX10 %s
; RUN: FileCheck --enable-var-scope --check-prefix=DBG %s < %t
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefix=GFX11 %s
; RUN: FileCheck --enable-var-scope --check-prefixes=DBG,DBG11 %s < %t
; REQUIRES: asserts

; FIXME: Verifier error with xnack enabled.
Expand All @@ -22,6 +24,10 @@
; DBG: Cluster ld/st SU([[L2]]) - SU([[L3:[0-9]+]])
; DBG: Cluster ld/st SU([[L3]]) - SU([[L4:[0-9]+]])

; DBG11: Cluster ld/st SU([[S1:[0-9]+]]) - SU([[S2:[0-9]+]])
; DBG11: Cluster ld/st SU([[S2]]) - SU([[S3:[0-9]+]])
; DBG11: Cluster ld/st SU([[S3]]) - SU([[S4:[0-9]+]])

; DBG-NOT: Cluster ld/st

define amdgpu_kernel void @cluster_load_cluster_store(i32* noalias %lb, i32* noalias %sb) {
Expand Down Expand Up @@ -93,6 +99,31 @@ define amdgpu_kernel void @cluster_load_cluster_store(i32* noalias %lb, i32* noa
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
; GFX10-NEXT: flat_store_dword v[6:7], v11
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: cluster_load_cluster_store:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:8
; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:16
; GFX11-NEXT: flat_load_b32 v5, v[0:1] offset:24
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: v_mov_b32_e32 v1, s1
; GFX11-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3)
; GFX11-NEXT: flat_store_b32 v[0:1], v3 offset:8
; GFX11-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3)
; GFX11-NEXT: flat_store_b32 v[0:1], v4 offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
; GFX11-NEXT: flat_store_b32 v[0:1], v5 offset:24
; GFX11-NEXT: s_endpgm
bb:
%la0 = getelementptr inbounds i32, i32* %lb, i32 0
%ld0 = load i32, i32* %la0
Expand Down Expand Up @@ -130,6 +161,10 @@ bb:
; DBG: Cluster ld/st SU([[L2]]) - SU([[L3:[0-9]+]])
; DBG: Cluster ld/st SU([[L3]]) - SU([[L4:[0-9]+]])

; DBG11: Cluster ld/st SU([[S1:[0-9]+]]) - SU([[S2:[0-9]+]])
; DBG11: Cluster ld/st SU([[S2]]) - SU([[S3:[0-9]+]])
; DBG11: Cluster ld/st SU([[S3]]) - SU([[S4:[0-9]+]])

; DBG-NOT: Cluster ld/st

define amdgpu_kernel void @cluster_load_valu_cluster_store(i32* noalias %lb, i32* noalias %sb) {
Expand Down Expand Up @@ -203,6 +238,33 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(i32* noalias %lb, i32
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
; GFX10-NEXT: flat_store_dword v[6:7], v10
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: cluster_load_valu_cluster_store:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: flat_load_b32 v2, v[0:1] offset:8
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:16
; GFX11-NEXT: flat_load_b32 v5, v[0:1] offset:24
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: v_mov_b32_e32 v1, s1
; GFX11-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
; GFX11-NEXT: v_add_nc_u32_e32 v2, 1, v2
; GFX11-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: flat_store_b32 v[0:1], v3
; GFX11-NEXT: flat_store_b32 v[0:1], v2 offset:8
; GFX11-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3)
; GFX11-NEXT: flat_store_b32 v[0:1], v4 offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
; GFX11-NEXT: flat_store_b32 v[0:1], v5 offset:24
; GFX11-NEXT: s_endpgm
bb:
%la0 = getelementptr inbounds i32, i32* %lb, i32 0
%ld0 = load i32, i32* %la0
Expand Down Expand Up @@ -266,6 +328,23 @@ define amdgpu_ps void @cluster_image_load(<8 x i32> inreg %src, <8 x i32> inreg
; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
; GFX10-NEXT: image_store v[2:5], v[0:1], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: cluster_image_load:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_add_nc_u32_e32 v2, 1, v0
; GFX11-NEXT: v_add_nc_u32_e32 v3, 1, v1
; GFX11-NEXT: v_add_nc_u32_e32 v6, 2, v0
; GFX11-NEXT: v_add_nc_u32_e32 v7, 2, v1
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_load v[2:5], v[2:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: image_load v[6:9], v[6:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v5, v5, v9
; GFX11-NEXT: v_add_f32_e32 v4, v4, v8
; GFX11-NEXT: v_add_f32_e32 v3, v3, v7
; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
; GFX11-NEXT: image_store v[2:5], v[0:1], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_endpgm
entry:
%x1 = add i32 %x, 1
%y1 = add i32 %y, 1
Expand Down Expand Up @@ -309,6 +388,19 @@ define amdgpu_ps void @no_cluster_image_load(<8 x i32> inreg %src1, <8 x i32> in
; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
; GFX10-NEXT: image_store v[2:5], v[0:1], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: no_cluster_image_load:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: image_load_mip v[2:5], [v0, v1, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: image_load_mip v[6:9], [v0, v1, v6], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v5, v5, v9
; GFX11-NEXT: v_add_f32_e32 v4, v4, v8
; GFX11-NEXT: v_add_f32_e32 v3, v3, v7
; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
; GFX11-NEXT: image_store v[2:5], v[0:1], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_endpgm
entry:
%val1 = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %x, i32 %y, i32 0, <8 x i32> %src1, i32 0, i32 0)
%val2 = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %x, i32 %y, i32 0, <8 x i32> %src2, i32 0, i32 0)
Expand Down Expand Up @@ -377,6 +469,33 @@ define amdgpu_ps void @cluster_image_sample(<8 x i32> inreg %src, <4 x i32> inre
; GFX10-NEXT: v_add_f32_e32 v2, v14, v18
; GFX10-NEXT: image_store v[2:5], v[0:1], s[12:19] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: cluster_image_sample:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_cvt_f32_i32_e32 v8, v0
; GFX11-NEXT: v_cvt_f32_i32_e32 v9, v1
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: v_mov_b32_e32 v10, 1.0
; GFX11-NEXT: v_add_f32_e32 v2, 1.0, v8
; GFX11-NEXT: v_add_f32_e32 v3, 1.0, v9
; GFX11-NEXT: v_mov_b32_e32 v5, v4
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v7, v4
; GFX11-NEXT: v_add_f32_e32 v8, 2.0, v8
; GFX11-NEXT: v_add_f32_e32 v9, 2.0, v9
; GFX11-NEXT: v_mov_b32_e32 v11, v10
; GFX11-NEXT: v_mov_b32_e32 v12, v10
; GFX11-NEXT: v_mov_b32_e32 v13, v10
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample_d v[2:5], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample_d v[6:9], v[8:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v5, v5, v9
; GFX11-NEXT: v_add_f32_e32 v4, v4, v8
; GFX11-NEXT: v_add_f32_e32 v3, v3, v7
; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
; GFX11-NEXT: image_store v[2:5], v[0:1], s[12:19] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_endpgm
entry:
%s = sitofp i32 %x to float
%t = sitofp i32 %y to float
Expand Down

0 comments on commit b5818e4

Please sign in to comment.