Skip to content

Commit

Permalink
[Support] improve known bits analysis for multiply by power-of-2 (1 set bit)
Browse files Browse the repository at this point in the history

This can be viewed as recognizing that a multiply by a power-of-2 doesn't
have a carry into the top bit of the (M+N)-bit product of an M-bit number
and an N-bit number.

Enhancing canonicalization of mul -> select might also handle some of
these if we were ok with increasing instruction count with casts in
some cases.

This doesn't help https://llvm.org/PR49055, but it's a simpler
pattern that we currently miss.
Note: "-sccp" already gets these examples using a constant
range analysis.

Differential Revision: https://reviews.llvm.org/D114962
  • Loading branch information
rotateright committed Dec 8, 2021
1 parent e1edec1 commit e9179a6
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 45 deletions.
9 changes: 8 additions & 1 deletion llvm/lib/Support/KnownBits.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -421,9 +421,16 @@ KnownBits KnownBits::mul(const KnownBits &LHS, const KnownBits &RHS,
"Self multiplication knownbits mismatch");

// Compute a conservative estimate for high known-0 bits.
// TODO: This could be generalized to number of sign bits (negative numbers).
unsigned LHSLeadZ = LHS.countMinLeadingZeros();
unsigned RHSLeadZ = RHS.countMinLeadingZeros();
unsigned LeadZ = std::max(LHSLeadZ + RHSLeadZ, BitWidth) - BitWidth;

// If either operand is a power-of-2, the multiply is only shifting bits in
// the other operand (there can't be a carry into the M+N bit of the result).
// Note: if we know that a value is entirely 0, that should simplify below.
bool BonusLZ = LHS.countMaxPopulation() == 1 || RHS.countMaxPopulation() == 1;

unsigned LeadZ = std::max(LHSLeadZ + RHSLeadZ + BonusLZ, BitWidth) - BitWidth;
assert(LeadZ <= BitWidth && "More zeros than bits?");

// The result of the bottom bits of an integer multiply can be
Expand Down
9 changes: 2 additions & 7 deletions llvm/test/CodeGen/AMDGPU/sdiv64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1528,7 +1528,6 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4
; GCN-NEXT: v_rcp_f32_e32 v3, v3
; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: s_mov_b32 s4, 0x8000
; GCN-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; GCN-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
; GCN-NEXT: v_trunc_f32_e32 v4, v4
Expand Down Expand Up @@ -1578,18 +1577,14 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc
; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
; GCN-NEXT: v_lshrrev_b32_e32 v5, 17, v4
; GCN-NEXT: v_lshlrev_b32_e32 v4, 15, v4
; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v6, vcc
; GCN-NEXT: v_lshrrev_b32_e32 v3, 17, v3
; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GCN-NEXT: v_mul_lo_u32 v4, v1, v3
; GCN-NEXT: v_mul_hi_u32 v5, v0, v3
; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GCN-NEXT: v_mul_lo_u32 v5, v0, v3
; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v4
; GCN-NEXT: v_sub_i32_e32 v5, vcc, s4, v5
; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0x8000, v5
; GCN-NEXT: v_subb_u32_e64 v6, s[4:5], v6, v1, vcc
; GCN-NEXT: v_sub_i32_e64 v7, s[4:5], v5, v0
; GCN-NEXT: v_subbrev_u32_e64 v6, s[4:5], 0, v6, s[4:5]
Expand Down
9 changes: 2 additions & 7 deletions llvm/test/CodeGen/AMDGPU/srem64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1701,7 +1701,6 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; GCN-NEXT: v_rcp_f32_e32 v2, v2
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: s_mov_b32 s4, 0x8000
; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; GCN-NEXT: v_trunc_f32_e32 v3, v3
Expand Down Expand Up @@ -1751,18 +1750,14 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
; GCN-NEXT: v_lshrrev_b32_e32 v4, 17, v3
; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3
; GCN-NEXT: v_addc_u32_e32 v2, vcc, v3, v5, vcc
; GCN-NEXT: v_lshrrev_b32_e32 v2, 17, v2
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc
; GCN-NEXT: v_mul_lo_u32 v3, v1, v2
; GCN-NEXT: v_mul_hi_u32 v4, v0, v2
; GCN-NEXT: v_mul_lo_u32 v2, v0, v2
; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
; GCN-NEXT: v_sub_i32_e32 v2, vcc, s4, v2
; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0x8000, v2
; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v1, vcc
; GCN-NEXT: v_sub_i32_e64 v5, s[4:5], v2, v0
; GCN-NEXT: v_subbrev_u32_e64 v6, s[6:7], 0, v4, s[4:5]
Expand Down
56 changes: 26 additions & 30 deletions llvm/test/Transforms/InstCombine/icmp-mul.ll
Original file line number Diff line number Diff line change
Expand Up @@ -684,11 +684,7 @@ define i1 @oss_fuzz_39934(i32 %arg) {

define i1 @mul_of_bool(i32 %x, i8 %y) {
; CHECK-LABEL: @mul_of_bool(
; CHECK-NEXT: [[B:%.*]] = and i32 [[X:%.*]], 1
; CHECK-NEXT: [[Z:%.*]] = zext i8 [[Y:%.*]] to i32
; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[B]], [[Z]]
; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[M]], 255
; CHECK-NEXT: ret i1 [[R]]
; CHECK-NEXT: ret i1 false
;
%b = and i32 %x, 1
%z = zext i8 %y to i32
Expand All @@ -699,11 +695,7 @@ define i1 @mul_of_bool(i32 %x, i8 %y) {

define i1 @mul_of_bool_commute(i32 %x, i32 %y) {
; CHECK-LABEL: @mul_of_bool_commute(
; CHECK-NEXT: [[X1:%.*]] = and i32 [[X:%.*]], 1
; CHECK-NEXT: [[Y8:%.*]] = and i32 [[Y:%.*]], 255
; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[Y8]], [[X1]]
; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[M]], 255
; CHECK-NEXT: ret i1 [[R]]
; CHECK-NEXT: ret i1 false
;
%x1 = and i32 %x, 1
%y8 = and i32 %y, 255
Expand All @@ -714,11 +706,7 @@ define i1 @mul_of_bool_commute(i32 %x, i32 %y) {

define i1 @mul_of_bools(i32 %x, i32 %y) {
; CHECK-LABEL: @mul_of_bools(
; CHECK-NEXT: [[X1:%.*]] = and i32 [[X:%.*]], 1
; CHECK-NEXT: [[Y1:%.*]] = and i32 [[Y:%.*]], 1
; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[X1]], [[Y1]]
; CHECK-NEXT: [[R:%.*]] = icmp ult i32 [[M]], 2
; CHECK-NEXT: ret i1 [[R]]
; CHECK-NEXT: ret i1 true
;
%x1 = and i32 %x, 1
%y1 = and i32 %y, 1
Expand All @@ -727,6 +715,8 @@ define i1 @mul_of_bools(i32 %x, i32 %y) {
ret i1 %r
}

; negative test - not a mask of low bit

define i1 @not_mul_of_bool(i32 %x, i8 %y) {
; CHECK-LABEL: @not_mul_of_bool(
; CHECK-NEXT: [[Q:%.*]] = and i32 [[X:%.*]], 3
Expand All @@ -742,6 +732,8 @@ define i1 @not_mul_of_bool(i32 %x, i8 %y) {
ret i1 %r
}

; negative test - not a single low bit

define i1 @not_mul_of_bool_commute(i32 %x, i32 %y) {
; CHECK-LABEL: @not_mul_of_bool_commute(
; CHECK-NEXT: [[X30:%.*]] = lshr i32 [[X:%.*]], 30
Expand All @@ -757,6 +749,9 @@ define i1 @not_mul_of_bool_commute(i32 %x, i32 %y) {
ret i1 %r
}

; negative test - no leading zeros for 's'
; TODO: If analysis was generalized for sign bits, we could reduce this to false.

define i1 @mul_of_bool_no_lz_other_op(i32 %x, i8 %y) {
; CHECK-LABEL: @mul_of_bool_no_lz_other_op(
; CHECK-NEXT: [[B:%.*]] = and i32 [[X:%.*]], 1
Expand All @@ -772,13 +767,11 @@ define i1 @mul_of_bool_no_lz_other_op(i32 %x, i8 %y) {
ret i1 %r
}

; high and low bits are known 0

define i1 @mul_of_pow2(i32 %x, i8 %y) {
; CHECK-LABEL: @mul_of_pow2(
; CHECK-NEXT: [[B:%.*]] = and i32 [[X:%.*]], 2
; CHECK-NEXT: [[Z:%.*]] = zext i8 [[Y:%.*]] to i32
; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[B]], [[Z]]
; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[M]], 510
; CHECK-NEXT: ret i1 [[R]]
; CHECK-NEXT: ret i1 false
;
%b = and i32 %x, 2
%z = zext i8 %y to i32
Expand All @@ -787,13 +780,11 @@ define i1 @mul_of_pow2(i32 %x, i8 %y) {
ret i1 %r
}

; high and low bits are known 0

define i1 @mul_of_pow2_commute(i32 %x, i32 %y) {
; CHECK-LABEL: @mul_of_pow2_commute(
; CHECK-NEXT: [[X4:%.*]] = and i32 [[X:%.*]], 4
; CHECK-NEXT: [[Y8:%.*]] = and i32 [[Y:%.*]], 255
; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[Y8]], [[X4]]
; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[M]], 1020
; CHECK-NEXT: ret i1 [[R]]
; CHECK-NEXT: ret i1 false
;
%x4 = and i32 %x, 4
%y8 = and i32 %y, 255
Expand All @@ -802,13 +793,11 @@ define i1 @mul_of_pow2_commute(i32 %x, i32 %y) {
ret i1 %r
}

; only bit 7 can be set by the multiply

define i32 @mul_of_pow2s(i32 %x, i32 %y) {
; CHECK-LABEL: @mul_of_pow2s(
; CHECK-NEXT: [[X8:%.*]] = and i32 [[X:%.*]], 8
; CHECK-NEXT: [[Y16:%.*]] = and i32 [[Y:%.*]], 16
; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[X8]], [[Y16]]
; CHECK-NEXT: [[BIT7:%.*]] = or i32 [[M]], 128
; CHECK-NEXT: ret i32 [[BIT7]]
; CHECK-NEXT: ret i32 128
;
%x8 = and i32 %x, 8
%y16 = and i32 %y, 16
Expand All @@ -817,6 +806,8 @@ define i32 @mul_of_pow2s(i32 %x, i32 %y) {
ret i32 %bit7
}

; negative test - 6 * 255 = 1530 (but constant range analysis can get this)

define i1 @not_mul_of_pow2(i32 %x, i8 %y) {
; CHECK-LABEL: @not_mul_of_pow2(
; CHECK-NEXT: [[Q:%.*]] = and i32 [[X:%.*]], 6
Expand All @@ -832,6 +823,8 @@ define i1 @not_mul_of_pow2(i32 %x, i8 %y) {
ret i1 %r
}

; negative test - 12 * 255 = 3060 (but constant range analysis can get this)

define i1 @not_mul_of_pow2_commute(i32 %x, i32 %y) {
; CHECK-LABEL: @not_mul_of_pow2_commute(
; CHECK-NEXT: [[X30:%.*]] = and i32 [[X:%.*]], 12
Expand All @@ -847,6 +840,9 @@ define i1 @not_mul_of_pow2_commute(i32 %x, i32 %y) {
ret i1 %r
}

; negative test - no leading zeros for 's'
; TODO: If analysis was generalized for sign bits, we could reduce this to false.

define i1 @mul_of_pow2_no_lz_other_op(i32 %x, i8 %y) {
; CHECK-LABEL: @mul_of_pow2_no_lz_other_op(
; CHECK-NEXT: [[B:%.*]] = and i32 [[X:%.*]], 2
Expand Down

0 comments on commit e9179a6

Please sign in to comment.