-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Set glc/slc on volatile/nontemporal SMEM loads #77443
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Jay Foad (jayfoad) ChangesPatch is 45.56 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/77443.diff 11 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index fc29ce8d71f2c2..3c5721d1509752 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -29,7 +29,6 @@ class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
let mayStore = 0;
let mayLoad = 1;
let hasSideEffects = 0;
- let maybeAtomic = 0;
let UseNamedOperandTable = 1;
let SchedRW = [WriteSMEM];
@@ -248,7 +247,6 @@ class SM_Atomic_Pseudo <string opName,
// Should these be set?
let ScalarStore = 1;
let hasSideEffects = 1;
- let maybeAtomic = 1;
let IsAtomicNoRet = !not(isRet);
let IsAtomicRet = isRet;
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index 4b3165f57546fb..0d00b3dd2f92d2 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -366,10 +366,10 @@ define amdgpu_kernel void @store_flat_scratch(ptr addrspace(1) noalias %out, i32
; HSA-LABEL: {{^}}use_constant_to_constant32_addrspacecast
; GFX9: s_load_dwordx2 [[PTRPTR:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}}
; GFX9: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0x8{{$}}
-; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0{{$}}
+; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0 glc{{$}}
; GFX9: s_mov_b32 s[[PTR_HI]], 0{{$}}
; GFX9: s_add_i32 s[[PTR_LO]], s[[PTR_LO]], [[OFFSET]]
-; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}}
+; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0 glc{{$}}
define amdgpu_kernel void @use_constant_to_constant32_addrspacecast(ptr addrspace(4) %ptr.ptr, i32 %offset) #0 {
%ptr = load volatile ptr addrspace(4), ptr addrspace(4) %ptr.ptr
%addrspacecast = addrspacecast ptr addrspace(4) %ptr to ptr addrspace(6)
@@ -381,10 +381,10 @@ define amdgpu_kernel void @use_constant_to_constant32_addrspacecast(ptr addrspac
; HSA-LABEL: {{^}}use_global_to_constant32_addrspacecast
; GFX9: s_load_dwordx2 [[PTRPTR:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}}
; GFX9: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0x8{{$}}
-; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0{{$}}
+; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0 glc{{$}}
; GFX9: s_mov_b32 s[[PTR_HI]], 0{{$}}
; GFX9: s_add_i32 s[[PTR_LO]], s[[PTR_LO]], [[OFFSET]]
-; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}}
+; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0 glc{{$}}
define amdgpu_kernel void @use_global_to_constant32_addrspacecast(ptr addrspace(4) %ptr.ptr, i32 %offset) #0 {
%ptr = load volatile ptr addrspace(1), ptr addrspace(4) %ptr.ptr
%addrspacecast = addrspacecast ptr addrspace(1) %ptr to ptr addrspace(6)
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index 2f637df4e93022..ec37af30229219 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -382,7 +382,7 @@ define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
; GCN-NEXT: s_and_b64 vcc, exec, s[0:1]
; GCN-NEXT: s_cbranch_vccnz .LBB7_2
; GCN-NEXT: ; %bb.1: ; %bb1
-; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 glc
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 3
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
index f5846c3d6db737..fcd14f12841d51 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
@@ -11,7 +11,7 @@ define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i3
; GFX67-NEXT: .LBB0_1: ; %loop
; GFX67-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX67-NEXT: s_waitcnt lgkmcnt(0)
-; GFX67-NEXT: s_load_dword s3, s[0:1], 0x64
+; GFX67-NEXT: s_load_dword s3, s[0:1], 0x64 glc
; GFX67-NEXT: s_add_i32 s2, s2, -1
; GFX67-NEXT: s_cmp_lg_u32 s2, 0
; GFX67-NEXT: s_cbranch_scc1 .LBB0_1
@@ -23,7 +23,7 @@ define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i3
; GFX89-NEXT: .LBB0_1: ; %loop
; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_load_dword s3, s[0:1], 0x190
+; GFX89-NEXT: s_load_dword s3, s[0:1], 0x190 glc
; GFX89-NEXT: s_add_i32 s2, s2, -1
; GFX89-NEXT: s_cmp_lg_u32 s2, 0
; GFX89-NEXT: s_cbranch_scc1 .LBB0_1
@@ -35,7 +35,7 @@ define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i3
; GFX12-NEXT: .LBB0_1: ; %loop
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x190
+; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x190 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_add_co_i32 s2, s2, -1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
@@ -65,7 +65,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i
; GFX6-NEXT: .LBB1_1: ; %loop
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX6-NEXT: s_load_dword s3, s[0:1], 0x0 glc
; GFX6-NEXT: s_add_i32 s2, s2, -1
; GFX6-NEXT: s_cmp_lg_u32 s2, 0
; GFX6-NEXT: s_cbranch_scc1 .LBB1_1
@@ -77,7 +77,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i
; GFX7-NEXT: .LBB1_1: ; %loop
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s3, s[0:1], 0x3e8
+; GFX7-NEXT: s_load_dword s3, s[0:1], 0x3e8 glc
; GFX7-NEXT: s_add_i32 s2, s2, -1
; GFX7-NEXT: s_cmp_lg_u32 s2, 0
; GFX7-NEXT: s_cbranch_scc1 .LBB1_1
@@ -89,7 +89,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i
; GFX89-NEXT: .LBB1_1: ; %loop
; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_load_dword s3, s[0:1], 0xfa0
+; GFX89-NEXT: s_load_dword s3, s[0:1], 0xfa0 glc
; GFX89-NEXT: s_add_i32 s2, s2, -1
; GFX89-NEXT: s_cmp_lg_u32 s2, 0
; GFX89-NEXT: s_cbranch_scc1 .LBB1_1
@@ -101,7 +101,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i
; GFX12-NEXT: .LBB1_1: ; %loop
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: s_load_b32 s3, s[0:1], 0xfa0
+; GFX12-NEXT: s_load_b32 s3, s[0:1], 0xfa0 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_add_co_i32 s2, s2, -1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
@@ -131,7 +131,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000000(ptr addrspace(4) inreg %ptr
; GFX689-NEXT: .LBB2_1: ; %loop
; GFX689-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX689-NEXT: s_waitcnt lgkmcnt(0)
-; GFX689-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX689-NEXT: s_load_dword s3, s[0:1], 0x0 glc
; GFX689-NEXT: s_add_i32 s2, s2, -1
; GFX689-NEXT: s_cmp_lg_u32 s2, 0
; GFX689-NEXT: s_cbranch_scc1 .LBB2_1
@@ -143,7 +143,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000000(ptr addrspace(4) inreg %ptr
; GFX7-NEXT: .LBB2_1: ; %loop
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s3, s[0:1], 0xf4240
+; GFX7-NEXT: s_load_dword s3, s[0:1], 0xf4240 glc
; GFX7-NEXT: s_add_i32 s2, s2, -1
; GFX7-NEXT: s_cmp_lg_u32 s2, 0
; GFX7-NEXT: s_cbranch_scc1 .LBB2_1
@@ -155,7 +155,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000000(ptr addrspace(4) inreg %ptr
; GFX12-NEXT: .LBB2_1: ; %loop
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x3d0900
+; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x3d0900 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_add_co_i32 s2, s2, -1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
@@ -185,7 +185,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000(ptr addrspace(4) inreg %pt
; GFX689-NEXT: .LBB3_1: ; %loop
; GFX689-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX689-NEXT: s_waitcnt lgkmcnt(0)
-; GFX689-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX689-NEXT: s_load_dword s3, s[0:1], 0x0 glc
; GFX689-NEXT: s_add_i32 s2, s2, -1
; GFX689-NEXT: s_cmp_lg_u32 s2, 0
; GFX689-NEXT: s_cbranch_scc1 .LBB3_1
@@ -197,7 +197,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000(ptr addrspace(4) inreg %pt
; GFX7-NEXT: .LBB3_1: ; %loop
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s3, s[0:1], 0x989680
+; GFX7-NEXT: s_load_dword s3, s[0:1], 0x989680 glc
; GFX7-NEXT: s_add_i32 s2, s2, -1
; GFX7-NEXT: s_cmp_lg_u32 s2, 0
; GFX7-NEXT: s_cbranch_scc1 .LBB3_1
@@ -210,7 +210,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000(ptr addrspace(4) inreg %pt
; GFX12-NEXT: .LBB3_1: ; %loop
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x0 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_add_co_i32 s2, s2, -1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
@@ -240,7 +240,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000000(ptr addrspace(4) inreg
; GFX6789-NEXT: .LBB4_1: ; %loop
; GFX6789-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6789-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6789-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX6789-NEXT: s_load_dword s3, s[0:1], 0x0 glc
; GFX6789-NEXT: s_add_i32 s2, s2, -1
; GFX6789-NEXT: s_cmp_lg_u32 s2, 0
; GFX6789-NEXT: s_cbranch_scc1 .LBB4_1
@@ -256,7 +256,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000000(ptr addrspace(4) inreg
; GFX12-NEXT: .LBB4_1: ; %loop
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x0 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_add_co_i32 s2, s2, -1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
@@ -286,7 +286,7 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr,
; GFX678-NEXT: .LBB5_1: ; %loop
; GFX678-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
-; GFX678-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX678-NEXT: s_load_dword s3, s[0:1], 0x0 glc
; GFX678-NEXT: s_add_i32 s2, s2, -1
; GFX678-NEXT: s_cmp_lg_u32 s2, 0
; GFX678-NEXT: s_cbranch_scc1 .LBB5_1
@@ -298,7 +298,7 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr,
; GFX9-NEXT: .LBB5_1: ; %loop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s3, s[0:1], -0x190
+; GFX9-NEXT: s_load_dword s3, s[0:1], -0x190 glc
; GFX9-NEXT: s_add_i32 s2, s2, -1
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
@@ -310,7 +310,7 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr,
; GFX12-NEXT: .LBB5_1: ; %loop
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: s_load_b32 s3, s[0:1], -0x190
+; GFX12-NEXT: s_load_b32 s3, s[0:1], -0x190 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_add_co_i32 s2, s2, -1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index e25a4b301537f7..8ff6971a1d7da0 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -264,8 +264,8 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
-; GCN-NEXT: s_load_dword s1, s[2:3], 0x0
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 glc
+; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 glc
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s0, s0, 0xffff
; GCN-NEXT: s_lshl_b32 s1, s1, 16
@@ -279,8 +279,8 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 glc
+; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 glc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5
; GFX9-NEXT: ;;#ASMSTART
@@ -292,8 +292,8 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
; GFX906: ; %bb.0:
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0 glc
+; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0 glc
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s0, s4, s5
; GFX906-NEXT: ;;#ASMSTART
@@ -305,8 +305,8 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 glc dlc
+; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0 glc dlc
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-NEXT: ;;#ASMSTART
@@ -548,8 +548,8 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
-; GCN-NEXT: s_load_dword s1, s[2:3], 0x0
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 glc
+; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 glc
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s0, s0, 0xffff
; GCN-NEXT: s_lshl_b32 s1, s1, 16
@@ -563,8 +563,8 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 glc
+; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 glc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5
; GFX9-NEXT: ;;#ASMSTART
@@ -576,8 +576,8 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
; GFX906: ; %bb.0:
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0 glc
+; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0 glc
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s0, s4, s5
; GFX906-NEXT: ;;#ASMSTART
@@ -589,8 +589,8 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 glc dlc
+; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0 glc dlc
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index c744ace37a8315..e3d594f978f668 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -9,7 +9,7 @@ define i32 @s_add_co_select_user() {
; GFX7: ; %bb.0: ; %bb
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 glc
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e64 v0, s[4:5], s6, s6
; GFX7-NEXT: s_or_b32 s4, s4, s5
@@ -28,7 +28,7 @@ define i32 @s_add_co_select_user() {
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 glc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e64 v0, s[4:5], s6, s6
; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
@@ -46,7 +46,7 @@ define i32 @s_add_co_select_user() {
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_mov_b64 s[4:5], 0
-; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 glc dlc
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, s5, s4, s4
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
@@ -63,7 +63,7 @@ define i32 @s_add_co_select_user() {
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b64 s[0:1], 0
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 glc dlc
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s1, s0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index acadee27981710..35bafd8d7df87c 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -564,7 +564,7 @@ define <8 x i32> @v8i32_func_void() #0 {
; GFX789-LABEL: v8i32_func_void:
; GFX789: ; %bb.0:
; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; GFX789-NEXT: s_mov_b32 s7, 0xf000
; GFX789-NEXT: s_mov_b32 s6, -1
; GFX789-NEXT: s_waitcnt lgkmcnt(0)
@@ -576,7 +576,7 @@ define <8 x i32> @v8i32_func_void() #0 {
; GFX11-LABEL: v8i32_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -594,7 +594,7 @@ define <16 x i32> @v16i32_func_void() #0 {
; GFX789-LABEL: v16i32_func_void:
; GFX789: ; %bb.0:
; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; GFX789-NEXT: s_mov_b32 s7, 0xf000
; GFX789-NEXT: s_mov_b32 s6, -1
; GFX789-NEXT: s_waitcnt lgkmcnt(0)
@@ -608,7 +608,7 @@ define <16 x i32> @v16i32_func_void() #0 {
; GFX11-LABEL: v16i32_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -628,7 +628,7 @@ define <32 x i32> @v32i32_func_void() #0 {
; GFX789-LABEL: v32i32_func_void:
; GFX789: ; %b...
[truncated]
|
@@ -29,7 +29,6 @@ class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt | |||
let mayStore = 0; | |||
let mayLoad = 1; | |||
let hasSideEffects = 0; | |||
let maybeAtomic = 0; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How does removing this cause SIMemoryLegalizer to process scalar loads and stores?
Should the memory model in AMDGPUUsage be updated to include the scalar memory instructions?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How does removing this cause SIMemoryLegalizer to process scalar loads and stores?
Not setting maybeAtomic = 0
on SMEM Pseudos means that they all inherit the default of maybeAtomic = 1
. The main loop in SIMemoryLegalizer::runOnMachineFunction
ignores anything that is not marked as maybeAtomic
.
Should the memory model in AMDGPUUsage be updated to include the scalar memory instructions?
Probably. Are you saying that should be a prerequisite for this patch?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I think if we change the implementation of the memory model we should ensure the ABI documentation is updated to match. I think only the non-atomic rows need to be updated to include the SMEM instructions since we do not support scalar atomics.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The code changes make sense to me.
It does seem the memory model does not explicitly cover SMEM at the moment.
Adding cache bypass bits certainly shouldn't make the code gen any less correct.
Not really a problem with this patch, but I cannot convince myself that RT_NT
is the right temporal hint on GFX12 for volatile -- I would have expected perhaps LU
. Seems some kind of scope might also be needed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This makes sense to me from a memory model perspective.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Made a start on updating the memory model but there are lots of things I am unsure about.
@@ -5813,6 +5813,18 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`. | |||
be reordered by | |||
hardware. | |||
|
|||
load *none* *none* - constant - !volatile & !nontemporal |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe this should say "constant address space and only if the compiler can prove that the address is uniform", but I'm not sure where to fit that into the table.
Should I mention scalar stores in the table? The text above says "Since the constant address space contents do not change during the execution of a kernel dispatch it is not legal to perform
stores".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
AFAIK we do not generate scalar stores as the instruction was removed from the isa.
I do not think you need to say "only if the address is uniform" as the compiler cannot generate scalar loads unless that condition is already met.
|
||
- volatile | ||
|
||
1. s_load/s_buffer_load glc=1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The VMEM case adds a waitcnt. Do we need to add s_waitcnt lgkmcnt(0)
here? That still wouldn't prevent reordering of two volatile accesses, one of which used global_load and one which used s_load.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We need a waitcnt lgkmcnt(0) here for the same reason. It is waiting for the preceding scalar load to complete before continuing. That ensures each volatile memory access is complete before moving to the next one. There is no need to wait for VMEM as any previous VMEM will have been followed by its own waitcnt vmcnt(0).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK. I have documented that but I don't understand SIGfx6CacheControl::insertWait
well enough to implement it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the spec looks fine to me. But you must admit that the concept of a volatile constant load is a bit meaningless. Not sure how it could ever happen. By definition a volatile location cannot be constant. That may be why I left scalar out of the memory model description.
Presumably this change needs to be done to the memory model specification for the other targets too.
@@ -5813,6 +5813,18 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`. | |||
be reordered by | |||
hardware. | |||
|
|||
load *none* *none* - constant - !volatile & !nontemporal |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
AFAIK we do not generate scalar stores as the instruction was removed from the isa.
I do not think you need to say "only if the address is uniform" as the compiler cannot generate scalar loads unless that condition is already met.
|
||
- volatile | ||
|
||
1. s_load/s_buffer_load glc=1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We need a waitcnt lgkmcnt(0) here for the same reason. It is waiting for the preceding scalar load to complete before continuing. That ensures each volatile memory access is complete before moving to the next one. There is no need to wait for VMEM as any previous VMEM will have been followed by its own waitcnt vmcnt(0).
No description provided.