[AMDGPU] Use v_mad_u64_u32 for IMAD32

Nic Curtis done the experiments to prove it is faster than a separate mul and add. Fixes: SWDEV-332806 Differential Revision: https://reviews.llvm.org/D127253
llvm · Jun 9, 2022 · 23db8e4 · 23db8e4
1 parent 06aa6ec
commit 23db8e4
Show file tree

Hide file tree

Showing 11 changed files with 434 additions and 114 deletions.
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -410,7 +410,7 @@ defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_I16_gfx9_e64>;
 
 } // End Predicates = [Has16BitInsts, isGFX10Plus]
 
-class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
+class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
   (ops node:$x, node:$y, node:$z),
   // When the inner operation is used multiple times, selecting 3-op
   // instructions may still be beneficial -- if the other users can be
@@ -440,7 +440,9 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
     return true;
   }]> {
   let PredicateCodeUsesOperands = 1;
+}
 
+class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : ThreeOpFragSDAG<op1, op2> {
   // The divergence predicate is irrelevant in GlobalISel, as we have
   // proper register bank checks. We just need to verify the constant
   // bus restriction when all the sources are considered.
@@ -568,6 +570,33 @@ def : OpSelBinOpClampPat<saddsat, V_ADD_I16_e64>;
 def : OpSelBinOpClampPat<ssubsat, V_SUB_I16_e64>;
 } // End SubtargetPredicate = isGFX9Plus
 
+// FIXME: GlobalISel in general does not handle instructions with 2 results,
+// so it cannot use these patterns.
+multiclass IMAD32_Pats <VOP3_Pseudo inst> {
+  def : GCNPat <
+        (ThreeOpFrag<mul, add> i32:$src0, i32:$src1, i32:$src2),
+        (EXTRACT_SUBREG (inst $src0, $src1,
+                              (REG_SEQUENCE SReg_64, // Use scalar and let it be legalized
+                                            $src2, sub0,
+                                            (i32 (IMPLICIT_DEF)), sub1),
+                                            0 /* clamp */),
+                        sub0)
+        >;
+  // Immediate src2 in the pattern above will not fold because it would be partially
+  // undef. Hence define specialized pattern for this case.
+  // FIXME: GlobalISel pattern exporter fails to export a pattern like this and asserts,
+  // make it SDAG only.
+  def : GCNPat <
+        (ThreeOpFragSDAG<mul, add> i32:$src0, i32:$src1, (i32 imm:$src2)),
+        (EXTRACT_SUBREG (inst $src0, $src1, (i64 (as_i64imm $src2)), 0 /* clamp */), sub0)
+        >;
+}
+
+let SubtargetPredicate = isGFX9GFX10 in // exclude pre-GFX9 where it was slow
+defm : IMAD32_Pats<V_MAD_U64_U32_e64>;
+let SubtargetPredicate = isGFX11Only in
+defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>;
+
 def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {
   let Src0RC64 = VRegSrc_32;
   let Src1RC64 = SCSrc_b32;

diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -268,12 +268,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
 ; GFX10W64-NEXT:  .LBB1_2:
 ; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W64-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    v_mul_lo_u32 v0, s8, v0
 ; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10W64-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
 ; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX10W64-NEXT:    s_endpgm
 ;
@@ -298,12 +297,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
 ; GFX10W32-NEXT:  .LBB1_2:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    v_mul_lo_u32 v0, s4, v0
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1]
 ; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX10W32-NEXT:    s_endpgm
 entry:

diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -288,12 +288,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
 ; GFX1064-NEXT:  .LBB1_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mul_lo_u32 v0, s8, v0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s6, -1
-; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
 ; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
@@ -326,12 +325,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
 ; GFX1032-NEXT:  .LBB1_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s6, -1
-; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
 ; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1032-NEXT:    s_endpgm
 entry:
@@ -878,11 +876,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s2, v2, v[0:1]
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    v_add_u32_e32 v1, v3, v1
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2]
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -923,11 +920,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v2
 ; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s6, -1
 ; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s2, v2, s[0:1]
-; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v3, v1
+; GFX1064-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2]
 ; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
@@ -967,11 +963,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v2
 ; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s6, -1
 ; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, s2, v2, s[0:1]
-; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v3, v1
+; GFX1032-NEXT:    v_mad_u64_u32 v[1:2], s0, s3, v2, v[1:2]
 ; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1032-NEXT:    s_endpgm
 entry:
@@ -2000,16 +1995,16 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX9-NEXT:  .LBB10_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX9-NEXT:    v_add_u32_e32 v1, v3, v4
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v2
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -2048,14 +2043,14 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mul_lo_u32 v4, s3, v2
-; GFX1064-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
-; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1064-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX1064-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0
 ; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s6, -1
-; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v3, v4
-; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s0, v2
+; GFX1064-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5]
+; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s0, v3
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
 ; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1064-NEXT:    s_endpgm
@@ -2094,14 +2089,14 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mul_lo_u32 v4, s3, v2
-; GFX1032-NEXT:    v_mad_u64_u32 v[2:3], s0, s2, v2, 0
-; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT:    v_mad_u64_u32 v[3:4], s0, s2, v2, 0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s6, -1
-; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v3, v4
-; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v2
+; GFX1032-NEXT:    v_mad_u64_u32 v[4:5], s0, s3, v2, v[4:5]
+; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v3
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
 ; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1032-NEXT:    s_endpgm

diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -293,12 +293,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
 ; GFX1064-NEXT:  .LBB1_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mul_lo_u32 v0, s6, v0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
 ; GFX1064-NEXT:    s_mov_b32 s6, -1
-; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
@@ -327,12 +326,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
 ; GFX1032-NEXT:  .LBB1_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s6, -1
-; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
 ; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1032-NEXT:    s_endpgm
 entry:
@@ -1012,13 +1010,12 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
 ; GFX9-NEXT:    s_mov_b32 s4, s0
 ; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    v_add_u32_e32 v1, v3, v1
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1053,11 +1050,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
 ; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v2
-; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s2, v2, s[4:5]
+; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5]
+; GFX1064-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
-; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v3, v1
 ; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
@@ -1091,11 +1087,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
 ; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v2
-; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5]
+; GFX1032-NEXT:    v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2]
+; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
-; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v3, v1
 ; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 entry:
@@ -2176,18 +2171,18 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
 ; GFX9-NEXT:  .LBB12_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
 ; GFX9-NEXT:    s_mov_b32 s4, s0
 ; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX9-NEXT:    v_add_u32_e32 v1, v3, v4
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v3
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -2220,13 +2215,13 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mul_lo_u32 v4, s3, v2
-; GFX1064-NEXT:    v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0
-; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1064-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX1064-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v3, v4
-; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v2
+; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
 ; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -2260,13 +2255,13 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mul_lo_u32 v4, s3, v2
-; GFX1032-NEXT:    v_mad_u64_u32 v[2:3], s2, s2, v2, 0
-; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1032-NEXT:    v_mad_u64_u32 v[3:4], s2, s2, v2, 0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX1032-NEXT:    v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5]
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v3, v4
-; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v2
+; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
 ; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0

diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -267,12 +267,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
 ; GFX10W64-NEXT:  .LBB1_2:
 ; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W64-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    v_mul_lo_u32 v0, s8, v0
 ; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10W64-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
 ; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX10W64-NEXT:    s_endpgm
 ;
@@ -297,12 +296,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
 ; GFX10W32-NEXT:  .LBB1_2:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    v_mul_lo_u32 v0, s4, v0
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1]
 ; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX10W32-NEXT:    s_endpgm
 entry: