Skip to content

Commit

Permalink
[AMDGPU] Use v_mad_u64_u32 for IMAD32
Browse files Browse the repository at this point in the history
Nic Curtis ran the experiments showing that it is faster than a
separate mul and add.

Fixes: SWDEV-332806

Differential Revision: https://reviews.llvm.org/D127253
  • Loading branch information
rampitec committed Jun 9, 2022
1 parent 06aa6ec commit 23db8e4
Show file tree
Hide file tree
Showing 11 changed files with 434 additions and 114 deletions.
31 changes: 30 additions & 1 deletion llvm/lib/Target/AMDGPU/VOP3Instructions.td
Expand Up @@ -410,7 +410,7 @@ defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_I16_gfx9_e64>;

} // End Predicates = [Has16BitInsts, isGFX10Plus]

class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
(ops node:$x, node:$y, node:$z),
// When the inner operation is used multiple times, selecting 3-op
// instructions may still be beneficial -- if the other users can be
Expand Down Expand Up @@ -440,7 +440,9 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
return true;
}]> {
let PredicateCodeUsesOperands = 1;
}

class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : ThreeOpFragSDAG<op1, op2> {
// The divergence predicate is irrelevant in GlobalISel, as we have
// proper register bank checks. We just need to verify the constant
// bus restriction when all the sources are considered.
Expand Down Expand Up @@ -568,6 +570,33 @@ def : OpSelBinOpClampPat<saddsat, V_ADD_I16_e64>;
def : OpSelBinOpClampPat<ssubsat, V_SUB_I16_e64>;
} // End SubtargetPredicate = isGFX9Plus

// Patterns that select a 32-bit integer multiply-add (an add of a mul, matched
// through the three-operand fragments) to the instruction passed as `inst`,
// keeping only the sub0 subregister of its result.
// FIXME: GlobalISel in general does not handle instructions with 2 results,
// so it cannot use these patterns.
multiclass IMAD32_Pats <VOP3_Pseudo inst> {
// General case: all three sources are i32 values. src2 is wrapped in a 64-bit
// REG_SEQUENCE whose sub0 is $src2 and whose sub1 is left undefined
// (IMPLICIT_DEF); clamp is disabled and only sub0 of the result is extracted.
def : GCNPat <
(ThreeOpFrag<mul, add> i32:$src0, i32:$src1, i32:$src2),
(EXTRACT_SUBREG (inst $src0, $src1,
(REG_SEQUENCE SReg_64, // Use scalar and let it be legalized
$src2, sub0,
(i32 (IMPLICIT_DEF)), sub1),
0 /* clamp */),
sub0)
>;
// Immediate src2 in the pattern above will not fold because it would be partially
// undef. Hence define specialized pattern for this case: the 32-bit immediate
// is converted with as_i64imm and passed directly as the i64 src2 operand.
// FIXME: GlobalISel pattern exporter fails to export a pattern like this and asserts,
// make it SDAG only (hence ThreeOpFragSDAG rather than ThreeOpFrag).
def : GCNPat <
(ThreeOpFragSDAG<mul, add> i32:$src0, i32:$src1, (i32 imm:$src2)),
(EXTRACT_SUBREG (inst $src0, $src1, (i64 (as_i64imm $src2)), 0 /* clamp */), sub0)
>;
}

// Instantiate the IMAD32 patterns once per subtarget group, each with its own
// V_MAD_U64_U32 pseudo: the GFX9/GFX10 variant and the GFX11-specific variant.
let SubtargetPredicate = isGFX9GFX10 in // exclude pre-GFX9 where it was slow
defm : IMAD32_Pats<V_MAD_U64_U32_e64>;
let SubtargetPredicate = isGFX11Only in
defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>;

def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {
let Src0RC64 = VRegSrc_32;
let Src1RC64 = SCSrc_b32;
Expand Down
10 changes: 4 additions & 6 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
Expand Up @@ -268,12 +268,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
; GFX10W64-NEXT: .LBB1_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
Expand All @@ -298,12 +297,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
; GFX10W32-NEXT: .LBB1_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1]
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
entry:
Expand Down
57 changes: 26 additions & 31 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
Expand Up @@ -288,12 +288,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
; GFX1064-NEXT: .LBB1_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v0, s8, v0
; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064-NEXT: s_mov_b32 s6, -1
; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
Expand Down Expand Up @@ -326,12 +325,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
; GFX1032-NEXT: .LBB1_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
entry:
Expand Down Expand Up @@ -878,11 +876,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, v[0:1]
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2]
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
Expand Down Expand Up @@ -923,11 +920,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v2
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064-NEXT: s_mov_b32 s6, -1
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, s[0:1]
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v1
; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2]
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
Expand Down Expand Up @@ -967,11 +963,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v2
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v2, s[0:1]
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v1
; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s0, s3, v2, v[1:2]
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
entry:
Expand Down Expand Up @@ -2000,16 +1995,16 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
; GFX9-NEXT: .LBB10_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_add_u32_e32 v1, v3, v4
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5]
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
Expand Down Expand Up @@ -2048,14 +2043,14 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v4, s3, v2
; GFX1064-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064-NEXT: s_mov_b32 s6, -1
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v4
; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v2
; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v3
; GFX1064-NEXT: v_mov_b32_e32 v1, v4
; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
Expand Down Expand Up @@ -2094,14 +2089,14 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v4, s3, v2
; GFX1032-NEXT: v_mad_u64_u32 v[2:3], s0, s2, v2, 0
; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s0, s2, v2, 0
; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v4
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v2
; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s0, s3, v2, v[4:5]
; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3
; GFX1032-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
Expand Down
57 changes: 26 additions & 31 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
Expand Up @@ -293,12 +293,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX1064-NEXT: .LBB1_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
; GFX1064-NEXT: s_mov_b32 s6, -1
; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
Expand Down Expand Up @@ -327,12 +326,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX1032-NEXT: .LBB1_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
entry:
Expand Down Expand Up @@ -1012,13 +1010,12 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
Expand Down Expand Up @@ -1053,11 +1050,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
; GFX1064-NEXT: v_readfirstlane_b32 s5, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v2
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, s[4:5]
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5]
; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v1
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
Expand Down Expand Up @@ -1091,11 +1087,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX1032-NEXT: v_readfirstlane_b32 s4, v0
; GFX1032-NEXT: v_readfirstlane_b32 s5, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v2
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5]
; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2]
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v1
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
Expand Down Expand Up @@ -2176,18 +2171,18 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
; GFX9-NEXT: .LBB12_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_add_u32_e32 v1, v3, v4
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
Expand Down Expand Up @@ -2220,13 +2215,13 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v4, s3, v2
; GFX1064-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
; GFX1064-NEXT: v_readfirstlane_b32 s4, v1
; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v4
; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v2
; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3
; GFX1064-NEXT: v_mov_b32_e32 v1, v4
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
Expand Down Expand Up @@ -2260,13 +2255,13 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v4, s3, v2
; GFX1032-NEXT: v_mad_u64_u32 v[2:3], s2, s2, v2, 0
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0
; GFX1032-NEXT: v_readfirstlane_b32 s4, v1
; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5]
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v4
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v2
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
; GFX1032-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
Expand Down
10 changes: 4 additions & 6 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
Expand Up @@ -267,12 +267,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
; GFX10W64-NEXT: .LBB1_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
Expand All @@ -297,12 +296,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
; GFX10W32-NEXT: .LBB1_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1]
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
entry:
Expand Down

0 comments on commit 23db8e4

Please sign in to comment.