[AMDGPU] Set glc/slc on volatile/nontemporal SMEM loads #77443

Closed
wants to merge 3 commits into from
Conversation

@jayfoad (Contributor) commented Jan 9, 2024

No description provided.

@llvmbot (Collaborator) commented Jan 9, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Jay Foad (jayfoad)

Changes

Patch is 45.56 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/77443.diff

11 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SMInstructions.td (-2)
  • (modified) llvm/test/CodeGen/AMDGPU/addrspacecast.ll (+4-4)
  • (modified) llvm/test/CodeGen/AMDGPU/branch-relaxation.ll (+1-1)
  • (modified) llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll (+18-18)
  • (modified) llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll (+16-16)
  • (modified) llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll (+4-4)
  • (modified) llvm/test/CodeGen/AMDGPU/function-returns.ll (+41-41)
  • (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll (+4-4)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll (+11-11)
  • (modified) llvm/test/CodeGen/AMDGPU/pack.v2f16.ll (+6-6)
  • (modified) llvm/test/CodeGen/AMDGPU/pack.v2i16.ll (+6-6)
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index fc29ce8d71f2c2..3c5721d1509752 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -29,7 +29,6 @@ class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
   let mayStore = 0;
   let mayLoad = 1;
   let hasSideEffects = 0;
-  let maybeAtomic = 0;
   let UseNamedOperandTable = 1;
   let SchedRW = [WriteSMEM];
 
@@ -248,7 +247,6 @@ class SM_Atomic_Pseudo <string opName,
   // Should these be set?
   let ScalarStore = 1;
   let hasSideEffects = 1;
-  let maybeAtomic = 1;
 
   let IsAtomicNoRet = !not(isRet);
   let IsAtomicRet = isRet;
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index 4b3165f57546fb..0d00b3dd2f92d2 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -366,10 +366,10 @@ define amdgpu_kernel void @store_flat_scratch(ptr addrspace(1) noalias %out, i32
 ; HSA-LABEL: {{^}}use_constant_to_constant32_addrspacecast
 ; GFX9: s_load_dwordx2 [[PTRPTR:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}}
 ; GFX9: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0x8{{$}}
-; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0{{$}}
+; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0 glc{{$}}
 ; GFX9: s_mov_b32 s[[PTR_HI]], 0{{$}}
 ; GFX9: s_add_i32 s[[PTR_LO]], s[[PTR_LO]], [[OFFSET]]
-; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}}
+; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0 glc{{$}}
 define amdgpu_kernel void @use_constant_to_constant32_addrspacecast(ptr addrspace(4) %ptr.ptr, i32 %offset) #0 {
   %ptr = load volatile ptr addrspace(4), ptr addrspace(4) %ptr.ptr
   %addrspacecast = addrspacecast ptr addrspace(4) %ptr to ptr addrspace(6)
@@ -381,10 +381,10 @@ define amdgpu_kernel void @use_constant_to_constant32_addrspacecast(ptr addrspac
 ; HSA-LABEL: {{^}}use_global_to_constant32_addrspacecast
 ; GFX9: s_load_dwordx2 [[PTRPTR:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}}
 ; GFX9: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0x8{{$}}
-; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0{{$}}
+; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0 glc{{$}}
 ; GFX9: s_mov_b32 s[[PTR_HI]], 0{{$}}
 ; GFX9: s_add_i32 s[[PTR_LO]], s[[PTR_LO]], [[OFFSET]]
-; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}}
+; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0 glc{{$}}
 define amdgpu_kernel void @use_global_to_constant32_addrspacecast(ptr addrspace(4) %ptr.ptr, i32 %offset) #0 {
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) %ptr.ptr
   %addrspacecast = addrspacecast ptr addrspace(1) %ptr to ptr addrspace(6)
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index 2f637df4e93022..ec37af30229219 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -382,7 +382,7 @@ define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
 ; GCN-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; GCN-NEXT:    s_cbranch_vccnz .LBB7_2
 ; GCN-NEXT:  ; %bb.1: ; %bb1
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0 glc
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 3
 ; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
index f5846c3d6db737..fcd14f12841d51 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
@@ -11,7 +11,7 @@ define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i3
 ; GFX67-NEXT:  .LBB0_1: ; %loop
 ; GFX67-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX67-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX67-NEXT:    s_load_dword s3, s[0:1], 0x64
+; GFX67-NEXT:    s_load_dword s3, s[0:1], 0x64 glc
 ; GFX67-NEXT:    s_add_i32 s2, s2, -1
 ; GFX67-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX67-NEXT:    s_cbranch_scc1 .LBB0_1
@@ -23,7 +23,7 @@ define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i3
 ; GFX89-NEXT:  .LBB0_1: ; %loop
 ; GFX89-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_load_dword s3, s[0:1], 0x190
+; GFX89-NEXT:    s_load_dword s3, s[0:1], 0x190 glc
 ; GFX89-NEXT:    s_add_i32 s2, s2, -1
 ; GFX89-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX89-NEXT:    s_cbranch_scc1 .LBB0_1
@@ -35,7 +35,7 @@ define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i3
 ; GFX12-NEXT:  .LBB0_1: ; %loop
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x190
+; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x190 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
@@ -65,7 +65,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i
 ; GFX6-NEXT:  .LBB1_1: ; %loop
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX6-NEXT:    s_load_dword s3, s[0:1], 0x0 glc
 ; GFX6-NEXT:    s_add_i32 s2, s2, -1
 ; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    s_cbranch_scc1 .LBB1_1
@@ -77,7 +77,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i
 ; GFX7-NEXT:  .LBB1_1: ; %loop
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x3e8
+; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x3e8 glc
 ; GFX7-NEXT:    s_add_i32 s2, s2, -1
 ; GFX7-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX7-NEXT:    s_cbranch_scc1 .LBB1_1
@@ -89,7 +89,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i
 ; GFX89-NEXT:  .LBB1_1: ; %loop
 ; GFX89-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_load_dword s3, s[0:1], 0xfa0
+; GFX89-NEXT:    s_load_dword s3, s[0:1], 0xfa0 glc
 ; GFX89-NEXT:    s_add_i32 s2, s2, -1
 ; GFX89-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX89-NEXT:    s_cbranch_scc1 .LBB1_1
@@ -101,7 +101,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i
 ; GFX12-NEXT:  .LBB1_1: ; %loop
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0xfa0
+; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0xfa0 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
@@ -131,7 +131,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000000(ptr addrspace(4) inreg %ptr
 ; GFX689-NEXT:  .LBB2_1: ; %loop
 ; GFX689-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX689-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX689-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX689-NEXT:    s_load_dword s3, s[0:1], 0x0 glc
 ; GFX689-NEXT:    s_add_i32 s2, s2, -1
 ; GFX689-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX689-NEXT:    s_cbranch_scc1 .LBB2_1
@@ -143,7 +143,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000000(ptr addrspace(4) inreg %ptr
 ; GFX7-NEXT:  .LBB2_1: ; %loop
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s3, s[0:1], 0xf4240
+; GFX7-NEXT:    s_load_dword s3, s[0:1], 0xf4240 glc
 ; GFX7-NEXT:    s_add_i32 s2, s2, -1
 ; GFX7-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX7-NEXT:    s_cbranch_scc1 .LBB2_1
@@ -155,7 +155,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000000(ptr addrspace(4) inreg %ptr
 ; GFX12-NEXT:  .LBB2_1: ; %loop
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x3d0900
+; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x3d0900 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
@@ -185,7 +185,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000(ptr addrspace(4) inreg %pt
 ; GFX689-NEXT:  .LBB3_1: ; %loop
 ; GFX689-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX689-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX689-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX689-NEXT:    s_load_dword s3, s[0:1], 0x0 glc
 ; GFX689-NEXT:    s_add_i32 s2, s2, -1
 ; GFX689-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX689-NEXT:    s_cbranch_scc1 .LBB3_1
@@ -197,7 +197,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000(ptr addrspace(4) inreg %pt
 ; GFX7-NEXT:  .LBB3_1: ; %loop
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x989680
+; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x989680 glc
 ; GFX7-NEXT:    s_add_i32 s2, s2, -1
 ; GFX7-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX7-NEXT:    s_cbranch_scc1 .LBB3_1
@@ -210,7 +210,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000(ptr addrspace(4) inreg %pt
 ; GFX12-NEXT:  .LBB3_1: ; %loop
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x0 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
@@ -240,7 +240,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000000(ptr addrspace(4) inreg
 ; GFX6789-NEXT:  .LBB4_1: ; %loop
 ; GFX6789-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6789-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6789-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX6789-NEXT:    s_load_dword s3, s[0:1], 0x0 glc
 ; GFX6789-NEXT:    s_add_i32 s2, s2, -1
 ; GFX6789-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6789-NEXT:    s_cbranch_scc1 .LBB4_1
@@ -256,7 +256,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000000(ptr addrspace(4) inreg
 ; GFX12-NEXT:  .LBB4_1: ; %loop
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x0 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
@@ -286,7 +286,7 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr,
 ; GFX678-NEXT:  .LBB5_1: ; %loop
 ; GFX678-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX678-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX678-NEXT:    s_load_dword s3, s[0:1], 0x0 glc
 ; GFX678-NEXT:    s_add_i32 s2, s2, -1
 ; GFX678-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX678-NEXT:    s_cbranch_scc1 .LBB5_1
@@ -298,7 +298,7 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr,
 ; GFX9-NEXT:  .LBB5_1: ; %loop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s3, s[0:1], -0x190
+; GFX9-NEXT:    s_load_dword s3, s[0:1], -0x190 glc
 ; GFX9-NEXT:    s_add_i32 s2, s2, -1
 ; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB5_1
@@ -310,7 +310,7 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr,
 ; GFX12-NEXT:  .LBB5_1: ; %loop
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_load_b32 s3, s[0:1], -0x190
+; GFX12-NEXT:    s_load_b32 s3, s[0:1], -0x190 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index e25a4b301537f7..8ff6971a1d7da0 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -264,8 +264,8 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GCN-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0 glc
+; GCN-NEXT:    s_load_dword s1, s[2:3], 0x0 glc
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 16
@@ -279,8 +279,8 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0 glc
+; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0 glc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
 ; GFX9-NEXT:    ;;#ASMSTART
@@ -292,8 +292,8 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX906-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x0 glc
+; GFX906-NEXT:    s_load_dword s5, s[2:3], 0x0 glc
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
 ; GFX906-NEXT:    ;;#ASMSTART
@@ -305,8 +305,8 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX11-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0 glc dlc
+; GFX11-NEXT:    s_load_b32 s1, s[2:3], 0x0 glc dlc
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
 ; GFX11-NEXT:    ;;#ASMSTART
@@ -548,8 +548,8 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GCN-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0 glc
+; GCN-NEXT:    s_load_dword s1, s[2:3], 0x0 glc
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 16
@@ -563,8 +563,8 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0 glc
+; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0 glc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
 ; GFX9-NEXT:    ;;#ASMSTART
@@ -576,8 +576,8 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX906-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x0 glc
+; GFX906-NEXT:    s_load_dword s5, s[2:3], 0x0 glc
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
 ; GFX906-NEXT:    ;;#ASMSTART
@@ -589,8 +589,8 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX11-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0 glc dlc
+; GFX11-NEXT:    s_load_b32 s1, s[2:3], 0x0 glc dlc
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
 ; GFX11-NEXT:    ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index c744ace37a8315..e3d594f978f668 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -9,7 +9,7 @@ define i32 @s_add_co_select_user() {
 ; GFX7:       ; %bb.0: ; %bb
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0 glc
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e64 v0, s[4:5], s6, s6
 ; GFX7-NEXT:    s_or_b32 s4, s4, s5
@@ -28,7 +28,7 @@ define i32 @s_add_co_select_user() {
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0 glc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e64 v0, s[4:5], s6, s6
 ; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
@@ -46,7 +46,7 @@ define i32 @s_add_co_select_user() {
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b64 s[4:5], 0
-; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0 glc dlc
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, s5, s4, s4
 ; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
@@ -63,7 +63,7 @@ define i32 @s_add_co_select_user() {
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b64 s[0:1], 0
-; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0 glc dlc
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, s1, s0, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index acadee27981710..35bafd8d7df87c 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -564,7 +564,7 @@ define <8 x i32> @v8i32_func_void() #0 {
 ; GFX789-LABEL: v8i32_func_void:
 ; GFX789:       ; %bb.0:
 ; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
@@ -576,7 +576,7 @@ define <8 x i32> @v8i32_func_void() #0 {
 ; GFX11-LABEL: v8i32_func_void:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -594,7 +594,7 @@ define <16 x i32> @v16i32_func_void() #0 {
 ; GFX789-LABEL: v16i32_func_void:
 ; GFX789:       ; %bb.0:
 ; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
@@ -608,7 +608,7 @@ define <16 x i32> @v16i32_func_void() #0 {
 ; GFX11-LABEL: v16i32_func_void:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -628,7 +628,7 @@ define <32 x i32> @v32i32_func_void() #0 {
 ; GFX789-LABEL: v32i32_func_void:
 ; GFX789:       ; %b...
[truncated]

@jayfoad (Contributor, Author) commented Jan 9, 2024

I think this makes sense by analogy with VMEM loads, but I'd appreciate a sanity check. @nhaehnle @perlfu @t-tye

@@ -29,7 +29,6 @@ class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
let mayStore = 0;
let mayLoad = 1;
let hasSideEffects = 0;
let maybeAtomic = 0;
Collaborator:

How does removing this cause SIMemoryLegalizer to process scalar loads and stores?

Should the memory model in AMDGPUUsage be updated to include the scalar memory instructions?

Contributor Author (jayfoad):

How does removing this cause SIMemoryLegalizer to process scalar loads and stores?

Not setting maybeAtomic = 0 on SMEM Pseudos means that they all inherit the default of maybeAtomic = 1. The main loop in SIMemoryLegalizer::runOnMachineFunction ignores anything that is not marked as maybeAtomic.
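For reference, a minimal sketch of that filtering step, assuming the in-tree names (SIInstrFlags::maybeAtomic is the TSFlags bit the TableGen field maps to; the actual expansion logic is elided):

  #include "SIDefines.h"                      // SIInstrFlags::maybeAtomic
  #include "llvm/CodeGen/MachineFunction.h"

  // Illustrative only: the shape of SIMemoryLegalizer's main loop.
  static void sketchRunOnMachineFunction(llvm::MachineFunction &MF) {
    using namespace llvm;
    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : MBB) {
        // Anything not marked maybeAtomic is skipped, so SMEM pseudos with
        // "let maybeAtomic = 0" never reached the glc/slc/dlc expansion.
        if (!(MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic))
          continue;
        // ... expand cache-control bits for loads, stores, and fences ...
      }
    }
  }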

Should the memory model in AMDGPUUsage be updated to include the scalar memory instructions?

Probably. Are you saying that should be a prerequisite for this patch?

Collaborator:

Yes, I think if we change the implementation of the memory model we should ensure the ABI documentation is updated to match. I think only the non-atomic rows need to be updated to include the SMEM instructions since we do not support scalar atomics.

@perlfu (Contributor) left a comment:

The code changes make sense to me.
It does seem the memory model does not explicitly cover SMEM at the moment.
Adding cache-bypass bits certainly shouldn't make the codegen any less correct.

Not really a problem with this patch, but I cannot convince myself that RT_NT is the right temporal hint on GFX12 for volatile -- I would have expected perhaps LU. It seems some kind of scope might also be needed.

@nhaehnle (Collaborator) left a comment:

This makes sense to me from a memory model perspective.

@jayfoad (Contributor, Author) left a comment:

Made a start on updating the memory model but there are lots of things I am unsure about.

@@ -5813,6 +5813,18 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`.
be reordered by
hardware.

load *none* *none* - constant - !volatile & !nontemporal
Contributor Author (jayfoad):

Maybe this should say "constant address space and only if the compiler can prove that the address is uniform", but I'm not sure where to fit that into the table.

Should I mention scalar stores in the table? The text above says "Since the constant address space contents do not change during the execution of a kernel dispatch it is not legal to perform stores".

@t-tye (Collaborator) commented Jan 11, 2024:

AFAIK we do not generate scalar stores, as the instruction was removed from the ISA.

I do not think you need to say "only if the address is uniform" as the compiler cannot generate scalar loads unless that condition is already met.


- volatile

1. s_load/s_buffer_load glc=1
Contributor Author (jayfoad):

The VMEM case adds a waitcnt. Do we need to add s_waitcnt lgkmcnt(0) here? That still wouldn't prevent reordering of two volatile accesses, one of which used global_load and one which used s_load.

Collaborator:

We need a waitcnt lgkmcnt(0) here for the same reason: it waits for the preceding scalar load to complete before continuing, which ensures each volatile memory access is complete before moving on to the next one. There is no need to wait for VMEM, as any previous VMEM access will have been followed by its own s_waitcnt vmcnt(0).

Contributor Author (jayfoad):

OK. I have documented that but I don't understand SIGfx6CacheControl::insertWait well enough to implement it.
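For illustration only, a sketch of what such an insertion might look like, in the style of SIGfx6CacheControl::insertWait; AMDGPU::encodeWaitcnt and the bit-mask getters are existing AMDGPUBaseInfo helpers, but IV, TII, MBB, MI, and DL are assumed from the surrounding pass, and this is not the code from this patch:

  // Emit "s_waitcnt lgkmcnt(0)" after a volatile scalar load so it completes
  // before the next memory access. Vmcnt/expcnt stay at their maximum
  // (unconstrained) values; only lgkmcnt is forced to zero.
  unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            AMDGPU::getVmcntBitMask(IV),   // don't wait on VMEM
                            AMDGPU::getExpcntBitMask(IV),  // don't wait on exports
                            /*Lgkmcnt=*/0);                // drain scalar loads
  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);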

Collaborator:

The spec looks fine to me. But you must admit that the concept of a volatile constant load is a bit meaningless; by definition a volatile location cannot be constant, and I am not sure how it could ever happen. That may be why I left scalar loads out of the memory model description.

Presumably this change needs to be made to the memory model specification for the other targets too.

@jayfoad jayfoad closed this by deleting the head repository Jan 29, 2024