[AMDGPU] Enable divergence predicates for negative inline constant subtraction

We have a pattern that undoes the "sub x, c -> add x, -c" canonicalization, since c is more likely
to be an inline immediate than -c. This patch enables the pattern to select either the scalar or the vector subtraction according to the divergence of the input node.
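
As a minimal illustration (hypothetical IR, not taken from the patch): AMDGPU's integer inline-constant range is -16..64, so 32 can be encoded for free while -32 requires a 32-bit literal. Mid-level passes canonicalize the subtraction into an addition, and the patterns below recover the subtract form during selection:

; Hypothetical example for illustration only.
; "%x - 32" reaches the backend canonicalized as "%x + (-32)";
; selecting it back to a hardware subtract lets 32 be an inline constant.
define i32 @sub_becomes_add(i32 %x) {
  %r = add i32 %x, -32
  ret i32 %r
}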

Reviewed By: foad

Differential Revision: https://reviews.llvm.org/D121360
alex-t committed Mar 10, 2022
1 parent af98b0a commit d159b44
Showing 3 changed files with 88 additions and 60 deletions.
6 changes: 3 additions & 3 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2764,18 +2764,18 @@ def : GCNPat <
 // an inline immediate than -c.
 // TODO: Also do for 64-bit.
 def : GCNPat<
-  (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+  (UniformBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)),
   (S_SUB_I32 SReg_32:$src0, NegSubInlineConst32:$src1)
 >;
 
 def : GCNPat<
-  (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+  (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)),
   (V_SUB_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
   let SubtargetPredicate = HasAddNoCarryInsts;
 }
 
 def : GCNPat<
-  (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+  (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)),
   (V_SUB_CO_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
   let SubtargetPredicate = NotHasAddNoCarryInsts;
 }
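
For context, UniformBinFrag and DivergentBinFrag are PatFrag wrappers that match the same operation but additionally test the SelectionDAG node's divergence flag. A rough sketch of their shape (assumed to follow the definitions in AMDGPUInstructions.td; reproduced from memory, not part of this diff):

// Sketch only: match a binary op whose result node is uniform/divergent.
class UniformBinFrag<SDPatternOperator Op> : PatFrag<
  (ops node:$src0, node:$src1),
  (Op $src0, $src1),
  [{ return !N->isDivergent(); }]>;

class DivergentBinFrag<SDPatternOperator Op> : PatFrag<
  (ops node:$src0, node:$src1),
  (Op $src0, $src1),
  [{ return N->isDivergent(); }]>;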
28 changes: 28 additions & 0 deletions llvm/test/CodeGen/AMDGPU/divergence-driven-negsubinlineconst.ll
@@ -0,0 +1,28 @@
+; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
+
+; FUNC-LABEL: {{^}}uniform_add_SIC:
+; GCN: S_SUB_I32 killed %{{[0-9]+}}, 32
+define amdgpu_kernel void @uniform_add_SIC(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %a = load i32, i32 addrspace(1)* %in
+  %result = add i32 %a, -32
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}divergent_add_SIC:
+; SI: V_SUB_CO_U32_e64 killed %{{[0-9]+}}, 32
+; GFX900: V_SUB_U32_e64 killed %{{[0-9]+}}, 32
+define amdgpu_kernel void @divergent_add_SIC(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
+  %a = load volatile i32, i32 addrspace(1)* %gep
+  %result = add i32 %a, -32
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
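
The second function is divergent because the load address depends on llvm.amdgcn.workitem.id.x. To run just this test locally, a lit invocation from a build directory along these lines should work (paths are illustrative):

$ ./bin/llvm-lit -v llvm/test/CodeGen/AMDGPU/divergence-driven-negsubinlineconst.ll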
114 changes: 57 additions & 57 deletions llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -306,43 +306,43 @@ define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v8
 ; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v16
 ; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v8
-; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9]
-; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
+; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9]
+; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
 ; GCN-NEXT: v_or_b32_e32 v11, v9, v11
-; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8
+; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8
 ; GCN-NEXT: v_or_b32_e32 v19, v19, v17
 ; GCN-NEXT: v_or_b32_e32 v18, v18, v16
 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10
 ; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v9
-; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
-; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc
+; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
+; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[4:5]
 ; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12
-; GCN-NEXT: v_cndmask_b32_e64 v11, v17, v19, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc
 ; GCN-NEXT: v_lshr_b64 v[9:10], v[4:5], v9
 ; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v12
-; GCN-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5]
 ; GCN-NEXT: v_or_b32_e32 v16, v16, v9
-; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[12:13]
-; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
-; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v12
+; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[12:13]
+; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
+; GCN-NEXT: v_subrev_i32_e64 v9, s[8:9], 64, v12
 ; GCN-NEXT: v_or_b32_e32 v11, v17, v10
 ; GCN-NEXT: v_lshl_b64 v[9:10], v[4:5], v9
 ; GCN-NEXT: v_or_b32_e32 v15, v13, v15
 ; GCN-NEXT: v_or_b32_e32 v14, v12, v14
-; GCN-NEXT: s_and_b64 vcc, s[8:9], s[6:7]
+; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
-; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v16, s[4:5]
 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
 ; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v12
 ; GCN-NEXT: v_cndmask_b32_e64 v6, v9, v6, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v9, v10, v11, s[4:5]
 ; GCN-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[4:5]
 ; GCN-NEXT: s_setpc_b64 s[30:31]
 %shl = shl <2 x i128> %lhs, %rhs
 ret <2 x i128> %shl
@@ -355,43 +355,43 @@ define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v8
 ; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16
 ; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8
-; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9]
-; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
+; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9]
+; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
 ; GCN-NEXT: v_or_b32_e32 v11, v9, v11
-; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8
+; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8
 ; GCN-NEXT: v_or_b32_e32 v19, v19, v17
 ; GCN-NEXT: v_or_b32_e32 v18, v18, v16
 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10
 ; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v9
-; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
-; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
+; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc
+; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
+; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[4:5]
 ; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12
-; GCN-NEXT: v_cndmask_b32_e64 v11, v17, v19, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc
 ; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v9
 ; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12
-; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, v11, v1, s[4:5]
 ; GCN-NEXT: v_or_b32_e32 v16, v16, v9
-; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[12:13]
-; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
-; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v12
+; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[12:13]
+; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
+; GCN-NEXT: v_subrev_i32_e64 v9, s[8:9], 64, v12
 ; GCN-NEXT: v_or_b32_e32 v11, v17, v10
 ; GCN-NEXT: v_lshr_b64 v[9:10], v[6:7], v9
 ; GCN-NEXT: v_or_b32_e32 v15, v13, v15
 ; GCN-NEXT: v_or_b32_e32 v14, v12, v14
-; GCN-NEXT: s_and_b64 vcc, s[8:9], s[6:7]
+; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
-; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v16, s[4:5]
 ; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], v8
 ; GCN-NEXT: v_lshr_b64 v[6:7], v[6:7], v12
 ; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v9, v10, v11, s[4:5]
 ; GCN-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v7, 0, v7, s[4:5]
 ; GCN-NEXT: s_setpc_b64 s[30:31]
 %shl = lshr <2 x i128> %lhs, %rhs
 ret <2 x i128> %shl
@@ -404,45 +404,45 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v8
 ; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16
 ; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8
-; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9]
-; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
+; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9]
+; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
 ; GCN-NEXT: v_or_b32_e32 v11, v9, v11
-; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8
+; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8
 ; GCN-NEXT: v_or_b32_e32 v19, v19, v17
 ; GCN-NEXT: v_or_b32_e32 v18, v18, v16
 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10
 ; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v9
-; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
-; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
+; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc
+; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
+; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[4:5]
 ; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12
-; GCN-NEXT: v_cndmask_b32_e64 v11, v17, v19, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc
 ; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v9
 ; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12
-; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, v11, v1, s[4:5]
 ; GCN-NEXT: v_or_b32_e32 v16, v16, v9
-; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[12:13]
-; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
-; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v12
+; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[12:13]
+; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
+; GCN-NEXT: v_subrev_i32_e64 v9, s[8:9], 64, v12
 ; GCN-NEXT: v_or_b32_e32 v11, v17, v10
 ; GCN-NEXT: v_ashr_i64 v[9:10], v[6:7], v9
 ; GCN-NEXT: v_or_b32_e32 v15, v13, v15
 ; GCN-NEXT: v_or_b32_e32 v14, v12, v14
-; GCN-NEXT: s_and_b64 vcc, s[8:9], s[6:7]
+; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
-; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v16, s[4:5]
 ; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v9, v10, v11, s[4:5]
 ; GCN-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7]
 ; GCN-NEXT: v_ashr_i64 v[8:9], v[2:3], v8
 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v3
-; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v8, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
 ; GCN-NEXT: v_ashr_i64 v[8:9], v[6:7], v12
 ; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v7
-; GCN-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v6, v7, v8, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5]
 ; GCN-NEXT: s_setpc_b64 s[30:31]
 %shl = ashr <2 x i128> %lhs, %rhs
 ret <2 x i128> %shl
