Skip to content

Commit 23db8e4

Browse files
committed
[AMDGPU] Use v_mad_u64_u32 for IMAD32
Nic Curtis ran the experiments showing that it is faster than a separate mul and add. Fixes: SWDEV-332806. Differential Revision: https://reviews.llvm.org/D127253
1 parent 06aa6ec commit 23db8e4

11 files changed

+434
-114
lines changed

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,7 @@ defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_I16_gfx9_e64>;
410410

411411
} // End Predicates = [Has16BitInsts, isGFX10Plus]
412412

413-
class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
413+
class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
414414
(ops node:$x, node:$y, node:$z),
415415
// When the inner operation is used multiple times, selecting 3-op
416416
// instructions may still be beneficial -- if the other users can be
@@ -440,7 +440,9 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
440440
return true;
441441
}]> {
442442
let PredicateCodeUsesOperands = 1;
443+
}
443444

445+
class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : ThreeOpFragSDAG<op1, op2> {
444446
// The divergence predicate is irrelevant in GlobalISel, as we have
445447
// proper register bank checks. We just need to verify the constant
446448
// bus restriction when all the sources are considered.
@@ -568,6 +570,33 @@ def : OpSelBinOpClampPat<saddsat, V_ADD_I16_e64>;
568570
def : OpSelBinOpClampPat<ssubsat, V_SUB_I16_e64>;
569571
} // End SubtargetPredicate = isGFX9Plus
570572

573+
// FIXME: GlobalISel in general does not handle instructions with 2 results,
574+
// so it cannot use these patterns.
575+
multiclass IMAD32_Pats <VOP3_Pseudo inst> {
576+
def : GCNPat <
577+
(ThreeOpFrag<mul, add> i32:$src0, i32:$src1, i32:$src2),
578+
(EXTRACT_SUBREG (inst $src0, $src1,
579+
(REG_SEQUENCE SReg_64, // Use scalar and let it be legalized
580+
$src2, sub0,
581+
(i32 (IMPLICIT_DEF)), sub1),
582+
0 /* clamp */),
583+
sub0)
584+
>;
585+
// Immediate src2 in the pattern above will not fold because it would be partially
586+
// undef. Hence define a specialized pattern for this case.
587+
// FIXME: GlobalISel pattern exporter fails to export a pattern like this and asserts,
588+
// make it SDAG only.
589+
def : GCNPat <
590+
(ThreeOpFragSDAG<mul, add> i32:$src0, i32:$src1, (i32 imm:$src2)),
591+
(EXTRACT_SUBREG (inst $src0, $src1, (i64 (as_i64imm $src2)), 0 /* clamp */), sub0)
592+
>;
593+
}
594+
595+
let SubtargetPredicate = isGFX9GFX10 in // exclude pre-GFX9 where it was slow
596+
defm : IMAD32_Pats<V_MAD_U64_U32_e64>;
597+
let SubtargetPredicate = isGFX11Only in
598+
defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>;
599+
571600
def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {
572601
let Src0RC64 = VRegSrc_32;
573602
let Src1RC64 = SCSrc_b32;

llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -268,12 +268,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
268268
; GFX10W64-NEXT: .LBB1_2:
269269
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
270270
; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7]
271-
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
272-
; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0
273271
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
274272
; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
273+
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
274+
; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
275275
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
276-
; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0
277276
; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
278277
; GFX10W64-NEXT: s_endpgm
279278
;
@@ -298,12 +297,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
298297
; GFX10W32-NEXT: .LBB1_2:
299298
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
300299
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
301-
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
302-
; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0
303300
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
304301
; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
302+
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
303+
; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1]
305304
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
306-
; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0
307305
; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
308306
; GFX10W32-NEXT: s_endpgm
309307
entry:

llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

Lines changed: 26 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -288,12 +288,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
288288
; GFX1064-NEXT: .LBB1_2:
289289
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
290290
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
291-
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
292-
; GFX1064-NEXT: v_mul_lo_u32 v0, s8, v0
293291
; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
292+
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
294293
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
295294
; GFX1064-NEXT: s_mov_b32 s6, -1
296-
; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0
295+
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
297296
; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
298297
; GFX1064-NEXT: s_endpgm
299298
;
@@ -326,12 +325,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
326325
; GFX1032-NEXT: .LBB1_2:
327326
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
328327
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
329-
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
330-
; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0
331328
; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
329+
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
332330
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
333331
; GFX1032-NEXT: s_mov_b32 s6, -1
334-
; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0
332+
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
335333
; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
336334
; GFX1032-NEXT: s_endpgm
337335
entry:
@@ -878,11 +876,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
878876
; GFX9-NEXT: v_mov_b32_e32 v0, s0
879877
; GFX9-NEXT: v_mov_b32_e32 v1, s1
880878
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
881-
; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2
882879
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, v[0:1]
883880
; GFX9-NEXT: s_mov_b32 s7, 0xf000
884881
; GFX9-NEXT: s_mov_b32 s6, -1
885-
; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
882+
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2]
886883
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
887884
; GFX9-NEXT: s_endpgm
888885
;
@@ -923,11 +920,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
923920
; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
924921
; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
925922
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
926-
; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v2
927923
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
928924
; GFX1064-NEXT: s_mov_b32 s6, -1
929925
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, s[0:1]
930-
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v1
926+
; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2]
931927
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
932928
; GFX1064-NEXT: s_endpgm
933929
;
@@ -967,11 +963,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
967963
; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
968964
; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
969965
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
970-
; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v2
971966
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
972967
; GFX1032-NEXT: s_mov_b32 s6, -1
973968
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v2, s[0:1]
974-
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v1
969+
; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s0, s3, v2, v[1:2]
975970
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
976971
; GFX1032-NEXT: s_endpgm
977972
entry:
@@ -2000,16 +1995,16 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
20001995
; GFX9-NEXT: .LBB10_2:
20011996
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
20021997
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2003-
; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2
2004-
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
2005-
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2006-
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
2007-
; GFX9-NEXT: v_add_u32_e32 v1, v3, v4
2008-
; GFX9-NEXT: v_mov_b32_e32 v3, s1
2009-
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v2
1998+
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0
20101999
; GFX9-NEXT: s_mov_b32 s7, 0xf000
20112000
; GFX9-NEXT: s_mov_b32 s6, -1
2012-
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
2001+
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5]
2002+
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2003+
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
2004+
; GFX9-NEXT: v_mov_b32_e32 v1, v4
2005+
; GFX9-NEXT: v_mov_b32_e32 v2, s1
2006+
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3
2007+
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
20132008
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
20142009
; GFX9-NEXT: s_endpgm
20152010
;
@@ -2048,14 +2043,14 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
20482043
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
20492044
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
20502045
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
2051-
; GFX1064-NEXT: v_mul_lo_u32 v4, s3, v2
2052-
; GFX1064-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
2053-
; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
2054-
; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
2046+
; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0
20552047
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
20562048
; GFX1064-NEXT: s_mov_b32 s6, -1
2057-
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v4
2058-
; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v2
2049+
; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5]
2050+
; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
2051+
; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
2052+
; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v3
2053+
; GFX1064-NEXT: v_mov_b32_e32 v1, v4
20592054
; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
20602055
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
20612056
; GFX1064-NEXT: s_endpgm
@@ -2094,14 +2089,14 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
20942089
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
20952090
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
20962091
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
2097-
; GFX1032-NEXT: v_mul_lo_u32 v4, s3, v2
2098-
; GFX1032-NEXT: v_mad_u64_u32 v[2:3], s0, s2, v2, 0
2099-
; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
2092+
; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s0, s2, v2, 0
21002093
; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
21012094
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
21022095
; GFX1032-NEXT: s_mov_b32 s6, -1
2103-
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v4
2104-
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v2
2096+
; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s0, s3, v2, v[4:5]
2097+
; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
2098+
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3
2099+
; GFX1032-NEXT: v_mov_b32_e32 v1, v4
21052100
; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
21062101
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
21072102
; GFX1032-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll

Lines changed: 26 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -293,12 +293,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
293293
; GFX1064-NEXT: .LBB1_2:
294294
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
295295
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
296-
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
297-
; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0
298296
; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
299297
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
298+
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
299+
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
300300
; GFX1064-NEXT: s_mov_b32 s6, -1
301-
; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0
302301
; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
303302
; GFX1064-NEXT: s_endpgm
304303
;
@@ -327,12 +326,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
327326
; GFX1032-NEXT: .LBB1_2:
328327
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
329328
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
330-
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
331-
; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0
332329
; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
333330
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
334331
; GFX1032-NEXT: s_mov_b32 s6, -1
335-
; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0
332+
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
333+
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
336334
; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
337335
; GFX1032-NEXT: s_endpgm
338336
entry:
@@ -1012,13 +1010,12 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
10121010
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
10131011
; GFX9-NEXT: v_mov_b32_e32 v0, s4
10141012
; GFX9-NEXT: v_mov_b32_e32 v1, s5
1015-
; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2
1016-
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
1013+
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
10171014
; GFX9-NEXT: s_mov_b32 s7, 0xf000
10181015
; GFX9-NEXT: s_mov_b32 s6, -1
1016+
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
10191017
; GFX9-NEXT: s_mov_b32 s4, s0
10201018
; GFX9-NEXT: s_mov_b32 s5, s1
1021-
; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
10221019
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
10231020
; GFX9-NEXT: s_endpgm
10241021
;
@@ -1053,11 +1050,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
10531050
; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
10541051
; GFX1064-NEXT: v_readfirstlane_b32 s5, v1
10551052
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
1056-
; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v2
1057-
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, s[4:5]
1053+
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5]
1054+
; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
10581055
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
10591056
; GFX1064-NEXT: s_mov_b32 s2, -1
1060-
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v1
10611057
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
10621058
; GFX1064-NEXT: s_endpgm
10631059
;
@@ -1091,11 +1087,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
10911087
; GFX1032-NEXT: v_readfirstlane_b32 s4, v0
10921088
; GFX1032-NEXT: v_readfirstlane_b32 s5, v1
10931089
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
1094-
; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v2
1095-
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
10961090
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5]
1091+
; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2]
1092+
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
10971093
; GFX1032-NEXT: s_mov_b32 s2, -1
1098-
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v1
10991094
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
11001095
; GFX1032-NEXT: s_endpgm
11011096
entry:
@@ -2176,18 +2171,18 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
21762171
; GFX9-NEXT: .LBB12_2:
21772172
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
21782173
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2179-
; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2
2180-
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0
2174+
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
21812175
; GFX9-NEXT: s_mov_b32 s4, s0
21822176
; GFX9-NEXT: s_mov_b32 s5, s1
2177+
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
21832178
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
21842179
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
2185-
; GFX9-NEXT: v_add_u32_e32 v1, v3, v4
2186-
; GFX9-NEXT: v_mov_b32_e32 v3, s1
2187-
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v2
2180+
; GFX9-NEXT: v_mov_b32_e32 v1, v4
2181+
; GFX9-NEXT: v_mov_b32_e32 v2, s1
2182+
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3
21882183
; GFX9-NEXT: s_mov_b32 s7, 0xf000
21892184
; GFX9-NEXT: s_mov_b32 s6, -1
2190-
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
2185+
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
21912186
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
21922187
; GFX9-NEXT: s_endpgm
21932188
;
@@ -2220,13 +2215,13 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
22202215
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
22212216
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
22222217
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
2223-
; GFX1064-NEXT: v_mul_lo_u32 v4, s3, v2
2224-
; GFX1064-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0
2225-
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
2218+
; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
22262219
; GFX1064-NEXT: v_readfirstlane_b32 s4, v1
2220+
; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
2221+
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
22272222
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
2228-
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v4
2229-
; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v2
2223+
; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3
2224+
; GFX1064-NEXT: v_mov_b32_e32 v1, v4
22302225
; GFX1064-NEXT: s_mov_b32 s2, -1
22312226
; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
22322227
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -2260,13 +2255,13 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
22602255
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
22612256
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
22622257
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
2263-
; GFX1032-NEXT: v_mul_lo_u32 v4, s3, v2
2264-
; GFX1032-NEXT: v_mad_u64_u32 v[2:3], s2, s2, v2, 0
2265-
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
2258+
; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0
22662259
; GFX1032-NEXT: v_readfirstlane_b32 s4, v1
2260+
; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5]
2261+
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
22672262
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
2268-
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v4
2269-
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v2
2263+
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
2264+
; GFX1032-NEXT: v_mov_b32_e32 v1, v4
22702265
; GFX1032-NEXT: s_mov_b32 s2, -1
22712266
; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
22722267
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0

llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -267,12 +267,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
267267
; GFX10W64-NEXT: .LBB1_2:
268268
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
269269
; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7]
270-
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
271-
; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0
272270
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
273271
; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
272+
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
273+
; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
274274
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
275-
; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0
276275
; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
277276
; GFX10W64-NEXT: s_endpgm
278277
;
@@ -297,12 +296,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
297296
; GFX10W32-NEXT: .LBB1_2:
298297
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
299298
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
300-
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
301-
; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0
302299
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
303300
; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
301+
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
302+
; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1]
304303
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
305-
; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0
306304
; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
307305
; GFX10W32-NEXT: s_endpgm
308306
entry:

0 commit comments

Comments
 (0)