Skip to content

Commit

Permalink
[DAG] Fold vector mul(x,0)/mul(x,1) to a clearing mask
Browse files Browse the repository at this point in the history
If we're multiplying all elements of a vector by '0' or '1' then we can more efficiently perform this as a clearing mask (that is likely to further simplify to a shuffle blend).

This was noticed when reviewing D87502 but seems to help idiv/irem by constant cases even more as '0'/'1' values are often used for 'passthrough' cases.

Differential Revision: https://reviews.llvm.org/D88225
  • Loading branch information
RKSimon committed Sep 26, 2020
1 parent decc194 commit a61272a
Show file tree
Hide file tree
Showing 8 changed files with 649 additions and 804 deletions.
32 changes: 32 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
Expand Down Expand Up @@ -3704,6 +3705,37 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
return DAG.getVScale(SDLoc(N), VT, C0 * C1);
}

// Fold ((mul x, 0/undef) -> 0,
// (mul x, 1) -> x) -> x)
// -> and(x, mask)
// We can replace vectors with '0' and '1' factors with a clearing mask.
if (VT.isFixedLengthVector()) {
unsigned NumElts = VT.getVectorNumElements();
SmallBitVector ClearMask;
ClearMask.reserve(NumElts);
auto IsClearMask = [&ClearMask](ConstantSDNode *V) {
if (!V || V->isNullValue()) {
ClearMask.push_back(true);
return true;
}
ClearMask.push_back(false);
return V->isOne();
};
if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::AND, VT)) &&
ISD::matchUnaryPredicate(N1, IsClearMask, /*AllowUndefs*/ true)) {
assert(N1.getOpcode() == ISD::BUILD_VECTOR && "Unknown constant vector");
SDLoc DL(N);
EVT LegalSVT = N1.getOperand(0).getValueType();
SDValue Zero = DAG.getConstant(0, DL, LegalSVT);
SDValue AllOnes = DAG.getAllOnesConstant(DL, LegalSVT);
SmallVector<SDValue, 16> Mask(NumElts, AllOnes);
for (unsigned I = 0; I != NumElts; ++I)
if (ClearMask[I])
Mask[I] = Zero;
return DAG.getNode(ISD::AND, DL, VT, N0, DAG.getBuildVector(VT, DL, Mask));
}
}

// reassociate mul
if (SDValue RMUL = reassociateOps(ISD::MUL, SDLoc(N), N0, N1, N->getFlags()))
return RMUL;
Expand Down
18 changes: 12 additions & 6 deletions llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_2]
; CHECK-NEXT: adrp x8, .LCPI0_3
; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
; CHECK-NEXT: and v2.16b, v0.16b, v2.16b
; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_3]
; CHECK-NEXT: neg v3.4s, v3.4s
; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
Expand Down Expand Up @@ -227,7 +228,8 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI7_2]
; CHECK-NEXT: adrp x8, .LCPI7_3
; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
; CHECK-NEXT: and v2.16b, v0.16b, v2.16b
; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_3]
; CHECK-NEXT: neg v3.4s, v3.4s
; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
Expand Down Expand Up @@ -282,7 +284,8 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI9_2]
; CHECK-NEXT: adrp x8, .LCPI9_3
; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
; CHECK-NEXT: and v2.16b, v0.16b, v2.16b
; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_3]
; CHECK-NEXT: neg v3.4s, v3.4s
; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
Expand Down Expand Up @@ -371,7 +374,8 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI12_3]
; CHECK-NEXT: adrp x8, .LCPI12_4
; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
; CHECK-NEXT: and v2.16b, v0.16b, v2.16b
; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_4]
; CHECK-NEXT: neg v3.4s, v3.4s
; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
Expand Down Expand Up @@ -696,7 +700,8 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_3]
; CHECK-NEXT: adrp x8, .LCPI22_4
; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
; CHECK-NEXT: and v2.16b, v0.16b, v2.16b
; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_4]
; CHECK-NEXT: neg v3.4s, v3.4s
; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
Expand Down Expand Up @@ -762,7 +767,8 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI24_3]
; CHECK-NEXT: adrp x8, .LCPI24_4
; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
; CHECK-NEXT: and v2.16b, v0.16b, v2.16b
; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_4]
; CHECK-NEXT: neg v3.4s, v3.4s
; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
Expand Down
Loading

0 comments on commit a61272a

Please sign in to comment.