Skip to content

Commit 04bb381

Browse files
committed
AMDGPU: Fix mfma agpr allocation failures with -O0
Previously we were getting lucky on cases that can use AV registers with the normal optimization pipeline. I do not understand what the check against getAddressableNumArchVGPRs was doing here. This logic needs to be consistent with getMaxNumVectorRegs, as that is what getReservedRegs uses to determine the AGPR budget. In the future we should directly check the minimum AGPR budget, and individual selection patterns need to know the minimum budget required for them.
1 parent 92d8313 commit 04bb381

File tree

2 files changed

+120
-3
lines changed

2 files changed

+120
-3
lines changed

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
8585
if (ST.hasGFX90AInsts()) {
8686
// FIXME: MayNeedAGPRs is a misnomer for how this is used. MFMA selection
8787
// should be separated from availability of AGPRs
88-
if (MFMAVGPRForm ||
89-
(ST.getMaxNumVGPRs(F) <= ST.getAddressableNumArchVGPRs() &&
90-
!mayUseAGPRs(F)))
88+
if (!mayUseAGPRs(F))
9189
MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
9290
}
9391

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx950 -amdgpu-mfma-vgpr-form=0 < %s | FileCheck %s
3+
; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx950 -amdgpu-mfma-vgpr-form=1 < %s | FileCheck %s
4+
5+
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half>, <16 x half>, <16 x float>, i32, i32 immarg, i32 immarg)
6+
7+
define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %arg, <8 x half> %a, <16 x half> %b, i32 %idx) #0 {
8+
; CHECK-LABEL: test_smfmac_f32_32x32x32_f16__vgpr:
9+
; CHECK: ; %bb.0: ; %bb
10+
; CHECK-NEXT: s_mov_b64 s[2:3], s[4:5]
11+
; CHECK-NEXT: v_mov_b32_e32 v1, v0
12+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
13+
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
14+
; CHECK-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x34
15+
; CHECK-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44
16+
; CHECK-NEXT: s_nop 0
17+
; CHECK-NEXT: s_load_dword s2, s[2:3], 0x64
18+
; CHECK-NEXT: s_mov_b32 s3, 0x3ff
19+
; CHECK-NEXT: v_and_b32_e64 v1, v1, s3
20+
; CHECK-NEXT: s_mov_b32 s3, 6
21+
; CHECK-NEXT: v_lshlrev_b32_e64 v8, s3, v1
22+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
23+
; CHECK-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:48
24+
; CHECK-NEXT: s_waitcnt vmcnt(0)
25+
; CHECK-NEXT: v_mov_b32_e32 v1, v7
26+
; CHECK-NEXT: v_mov_b32_e32 v2, v6
27+
; CHECK-NEXT: v_mov_b32_e32 v3, v5
28+
; CHECK-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec
29+
; CHECK-NEXT: global_load_dwordx4 v[10:13], v8, s[0:1] offset:32
30+
; CHECK-NEXT: s_waitcnt vmcnt(0)
31+
; CHECK-NEXT: v_mov_b32_e32 v5, v13
32+
; CHECK-NEXT: v_mov_b32_e32 v6, v12
33+
; CHECK-NEXT: v_mov_b32_e32 v7, v11
34+
; CHECK-NEXT: v_mov_b32_e32 v24, v10
35+
; CHECK-NEXT: global_load_dwordx4 v[10:13], v8, s[0:1] offset:16
36+
; CHECK-NEXT: s_waitcnt vmcnt(0)
37+
; CHECK-NEXT: v_mov_b32_e32 v25, v13
38+
; CHECK-NEXT: v_mov_b32_e32 v26, v12
39+
; CHECK-NEXT: v_mov_b32_e32 v27, v11
40+
; CHECK-NEXT: v_mov_b32_e32 v28, v10
41+
; CHECK-NEXT: global_load_dwordx4 v[8:11], v8, s[0:1]
42+
; CHECK-NEXT: s_waitcnt vmcnt(0)
43+
; CHECK-NEXT: v_mov_b32_e32 v29, v11
44+
; CHECK-NEXT: v_mov_b32_e32 v30, v10
45+
; CHECK-NEXT: v_mov_b32_e32 v31, v9
46+
; CHECK-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
47+
; CHECK-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 killed $exec
48+
; CHECK-NEXT: v_mov_b32_e32 v9, v31
49+
; CHECK-NEXT: v_mov_b32_e32 v10, v30
50+
; CHECK-NEXT: v_mov_b32_e32 v11, v29
51+
; CHECK-NEXT: v_mov_b32_e32 v12, v28
52+
; CHECK-NEXT: v_mov_b32_e32 v13, v27
53+
; CHECK-NEXT: v_mov_b32_e32 v14, v26
54+
; CHECK-NEXT: v_mov_b32_e32 v15, v25
55+
; CHECK-NEXT: v_mov_b32_e32 v16, v24
56+
; CHECK-NEXT: v_mov_b32_e32 v17, v7
57+
; CHECK-NEXT: v_mov_b32_e32 v18, v6
58+
; CHECK-NEXT: v_mov_b32_e32 v19, v5
59+
; CHECK-NEXT: v_mov_b32_e32 v20, v4
60+
; CHECK-NEXT: v_mov_b32_e32 v21, v3
61+
; CHECK-NEXT: v_mov_b32_e32 v22, v2
62+
; CHECK-NEXT: v_mov_b32_e32 v23, v1
63+
; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
64+
; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[14:15]
65+
; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[10:11]
66+
; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[8:9]
67+
; CHECK-NEXT: v_mov_b64_e32 v[26:27], s[6:7]
68+
; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[4:5]
69+
; CHECK-NEXT: v_mov_b32_e32 v1, s2
70+
; CHECK-NEXT: s_nop 1
71+
; CHECK-NEXT: v_smfmac_f32_32x32x32_f16 v[8:23], v[2:5], v[24:31], v1 cbsz:1 abid:2
72+
; CHECK-NEXT: s_nop 11
73+
; CHECK-NEXT: v_mov_b32_e32 v1, v23
74+
; CHECK-NEXT: v_mov_b32_e32 v6, v22
75+
; CHECK-NEXT: v_mov_b32_e32 v7, v21
76+
; CHECK-NEXT: v_mov_b32_e32 v2, v20
77+
; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
78+
; CHECK-NEXT: v_mov_b32_e32 v3, v7
79+
; CHECK-NEXT: v_mov_b32_e32 v4, v6
80+
; CHECK-NEXT: v_mov_b32_e32 v5, v1
81+
; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48
82+
; CHECK-NEXT: v_mov_b32_e32 v1, v19
83+
; CHECK-NEXT: v_mov_b32_e32 v6, v18
84+
; CHECK-NEXT: v_mov_b32_e32 v7, v17
85+
; CHECK-NEXT: v_mov_b32_e32 v2, v16
86+
; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
87+
; CHECK-NEXT: v_mov_b32_e32 v3, v7
88+
; CHECK-NEXT: v_mov_b32_e32 v4, v6
89+
; CHECK-NEXT: v_mov_b32_e32 v5, v1
90+
; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32
91+
; CHECK-NEXT: v_mov_b32_e32 v1, v15
92+
; CHECK-NEXT: v_mov_b32_e32 v6, v14
93+
; CHECK-NEXT: v_mov_b32_e32 v7, v13
94+
; CHECK-NEXT: v_mov_b32_e32 v2, v12
95+
; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
96+
; CHECK-NEXT: v_mov_b32_e32 v3, v7
97+
; CHECK-NEXT: v_mov_b32_e32 v4, v6
98+
; CHECK-NEXT: v_mov_b32_e32 v5, v1
99+
; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16
100+
; CHECK-NEXT: v_mov_b32_e32 v1, v11
101+
; CHECK-NEXT: v_mov_b32_e32 v6, v10
102+
; CHECK-NEXT: v_mov_b32_e32 v7, v9
103+
; CHECK-NEXT: v_mov_b32_e32 v2, v8
104+
; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
105+
; CHECK-NEXT: v_mov_b32_e32 v3, v7
106+
; CHECK-NEXT: v_mov_b32_e32 v4, v6
107+
; CHECK-NEXT: v_mov_b32_e32 v5, v1
108+
; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
109+
; CHECK-NEXT: s_endpgm
110+
bb:
111+
%id = call i32 @llvm.amdgcn.workitem.id.x()
112+
%gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
113+
%in.1 = load <16 x float>, ptr addrspace(1) %gep
114+
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %a, <16 x half> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
115+
store <16 x float> %mai.1, ptr addrspace(1) %arg
116+
ret void
117+
}
118+
119+
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0,0" }

0 commit comments

Comments
 (0)