-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU/load-global-i32: regenerate test using UTC (NFC) #73962
Conversation
Fix the RUN lines so that UTC runs cleanly, strip hand-written CHECK lines, and regenerate the test load-global-i32.ll using utils/update_llc_test_checks.py.
@llvm/pr-subscribers-backend-amdgpu Author: Ramkumar Ramachandra (artagnon) Changes: Fix the RUN lines so that UTC runs cleanly, strip hand-written CHECK lines, and regenerate the test load-global-i32.ll using utils/update_llc_test_checks.py. Patch is 226.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/73962.diff 1 Files Affected:
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index c4d9b4b2bb5ebbb..55f0773f7e05aea 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -1,113 +1,825 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI-NOHSA -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=GCNX3-HSA -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=GCNX3-NOHSA -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCNX3-HSA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCNX3-NOHSA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-HSA -check-prefix=GCN-GFX900-HSA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-HSA -check-prefix=GCN-GFX908-HSA %s
-; FUNC-LABEL: {{^}}global_load_i32:
-; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}
-; GCN-HSA: {{flat|global}}_load_dword
-
-; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_load_i32:
+; SI-NOHSA: ; %bb.0: ; %entry
+; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT: s_mov_b32 s6, -1
+; SI-NOHSA-NEXT: s_mov_b32 s10, s6
+; SI-NOHSA-NEXT: s_mov_b32 s11, s7
+; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT: s_mov_b32 s8, s2
+; SI-NOHSA-NEXT: s_mov_b32 s9, s3
+; SI-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SI-NOHSA-NEXT: s_mov_b32 s4, s0
+; SI-NOHSA-NEXT: s_mov_b32 s5, s1
+; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOHSA-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NOHSA-NEXT: s_endpgm
+;
+; GCNX3-HSA-LABEL: global_load_i32:
+; GCNX3-HSA: ; %bb.0: ; %entry
+; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT: flat_load_dword v2, v[0:1]
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1
+; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
+; GCNX3-HSA-NEXT: flat_store_dword v[0:1], v2
+; GCNX3-HSA-NEXT: s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_load_i32:
+; GCNX3-NOHSA: ; %bb.0: ; %entry
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
+; GCNX3-NOHSA-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: s_endpgm
+;
+; EG-LABEL: global_load_i32:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_load_i32:
+; GCN-HSA: ; %bb.0: ; %entry
+; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, 0
+; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT: global_load_dword v1, v0, s[2:3]
+; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-HSA-NEXT: s_endpgm
entry:
%ld = load i32, ptr addrspace(1) %in
store i32 %ld, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}global_load_v2i32:
-; GCN-NOHSA: buffer_load_dwordx2
-; GCN-HSA: {{flat|global}}_load_dwordx2
-
-; EG: VTX_READ_64
define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_load_v2i32:
+; SI-NOHSA: ; %bb.0: ; %entry
+; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT: s_mov_b32 s6, -1
+; SI-NOHSA-NEXT: s_mov_b32 s10, s6
+; SI-NOHSA-NEXT: s_mov_b32 s11, s7
+; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT: s_mov_b32 s8, s2
+; SI-NOHSA-NEXT: s_mov_b32 s9, s3
+; SI-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NOHSA-NEXT: s_mov_b32 s4, s0
+; SI-NOHSA-NEXT: s_mov_b32 s5, s1
+; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NOHSA-NEXT: s_endpgm
+;
+; GCNX3-HSA-LABEL: global_load_v2i32:
+; GCNX3-HSA: ; %bb.0: ; %entry
+; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s1
+; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
+; GCNX3-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCNX3-HSA-NEXT: s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_load_v2i32:
+; GCNX3-NOHSA: ; %bb.0: ; %entry
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: s_endpgm
+;
+; EG-LABEL: global_load_v2i32:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_load_v2i32:
+; GCN-HSA: ; %bb.0: ; %entry
+; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, 0
+; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GCN-HSA-NEXT: s_endpgm
entry:
%ld = load <2 x i32>, ptr addrspace(1) %in
store <2 x i32> %ld, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}global_load_v3i32:
-; SI-NOHSA: buffer_load_dwordx4
-; GCNX3-NOHSA: buffer_load_dwordx3
-; GCNX3-HSA: {{flat|global}}_load_dwordx3
-
-; EG: VTX_READ_128
define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_load_v3i32:
+; SI-NOHSA: ; %bb.0: ; %entry
+; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT: s_mov_b32 s6, -1
+; SI-NOHSA-NEXT: s_mov_b32 s10, s6
+; SI-NOHSA-NEXT: s_mov_b32 s11, s7
+; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT: s_mov_b32 s8, s2
+; SI-NOHSA-NEXT: s_mov_b32 s9, s3
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NOHSA-NEXT: s_mov_b32 s4, s0
+; SI-NOHSA-NEXT: s_mov_b32 s5, s1
+; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOHSA-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
+; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NOHSA-NEXT: s_endpgm
+;
+; GCNX3-HSA-LABEL: global_load_v3i32:
+; GCNX3-HSA: ; %bb.0: ; %entry
+; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s1
+; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
+; GCNX3-HSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
+; GCNX3-HSA-NEXT: s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_load_v3i32:
+; GCNX3-NOHSA: ; %bb.0: ; %entry
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: s_endpgm
+;
+; EG-LABEL: global_load_v3i32:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV * T2.X, T0.Z,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_load_v3i32:
+; GCN-HSA: ; %bb.0: ; %entry
+; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, 0
+; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT: global_load_dwordx3 v[0:2], v3, s[2:3]
+; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GCN-HSA-NEXT: s_endpgm
entry:
%ld = load <3 x i32>, ptr addrspace(1) %in
store <3 x i32> %ld, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}global_load_v4i32:
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-
-; EG: VTX_READ_128
define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_load_v4i32:
+; SI-NOHSA: ; %bb.0: ; %entry
+; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT: s_mov_b32 s6, -1
+; SI-NOHSA-NEXT: s_mov_b32 s10, s6
+; SI-NOHSA-NEXT: s_mov_b32 s11, s7
+; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT: s_mov_b32 s8, s2
+; SI-NOHSA-NEXT: s_mov_b32 s9, s3
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NOHSA-NEXT: s_mov_b32 s4, s0
+; SI-NOHSA-NEXT: s_mov_b32 s5, s1
+; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NOHSA-NEXT: s_endpgm
+;
+; GCNX3-HSA-LABEL: global_load_v4i32:
+; GCNX3-HSA: ; %bb.0: ; %entry
+; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCNX3-HSA-NEXT: s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_load_v4i32:
+; GCNX3-NOHSA: ; %bb.0: ; %entry
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: s_endpgm
+;
+; EG-LABEL: global_load_v4i32:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_load_v4i32:
+; GCN-HSA: ; %bb.0: ; %entry
+; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0
+; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GCN-HSA-NEXT: s_endpgm
entry:
%ld = load <4 x i32>, ptr addrspace(1) %in
store <4 x i32> %ld, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}global_load_v8i32:
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-
-; EG: VTX_READ_128
-; EG: VTX_READ_128
define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_load_v8i32:
+; SI-NOHSA: ; %bb.0: ; %entry
+; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT: s_mov_b32 s6, -1
+; SI-NOHSA-NEXT: s_mov_b32 s10, s6
+; SI-NOHSA-NEXT: s_mov_b32 s11, s7
+; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT: s_mov_b32 s8, s2
+; SI-NOHSA-NEXT: s_mov_b32 s9, s3
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
+; SI-NOHSA-NEXT: s_mov_b32 s4, s0
+; SI-NOHSA-NEXT: s_mov_b32 s5, s1
+; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
+; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; SI-NOHSA-NEXT: s_endpgm
+;
+; GCNX3-HSA-LABEL: global_load_v8i32:
+; GCNX3-HSA: ; %bb.0: ; %entry
+; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
+; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s0
+; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 16
+; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s1
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, s0
+; GCNX3-HSA-NEXT: s_waitcnt vmcnt(1)
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
+; GCNX3-HSA-NEXT: s_waitcnt vmcnt(1)
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; GCNX3-HSA-NEXT: s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_load_v8i32:
+; GCNX3-NOHSA: ; %bb.0: ; %entry
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: s_endpgm
+;
+; EG-LABEL: global_load_v8i32:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 0, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 16, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_load_v8i32:
+; GCN-HSA: ; %bb.0: ; %entry
+; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0
+; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16
+; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3]
+; ...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Seems reasonable to me.
Fix the RUN lines so that UTC runs cleanly, and regenerate the test load-global-i32.ll using utils/update_llc_test_checks.py.