[PowerPC] avoid masking already-zero bits in BitPermutationSelector
The current BitPermutationSelector generates code to build a value by tracking two types of bits: ConstZero and Variable.
ConstZero means a bit we need to mask off, and Variable is a bit we copy from an input value.

This patch adds a third type of bit, VariableKnownToBeZero, produced by an AssertZext node or a zero-extending load node.
VariableKnownToBeZero means a bit that comes from an input value but is known to be already zero, so we do not need to mask it.
VariableKnownToBeZero gives more flexibility in grouping bits, since we can avoid redundant masking for these bits.
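
To make the idea concrete, here is a minimal standalone sketch, not the LLVM selector itself: it models how the bits of (and (zext i16 %v to i32), 0xFFFF) could be classified under the three kinds above. The enum and variable names are illustrative assumptions rather than the actual PPCISelDAGToDAG types.

// Minimal sketch (illustrative only, not PPCISelDAGToDAG): classify the bits
// of (and (zext i16 %v to i32), 0xFFFF) using the three kinds described above.
#include <array>
#include <cstdint>
#include <iostream>

enum class BitKind { ConstZero, Variable, VariableKnownToBeZero };

int main() {
  constexpr unsigned NumBits = 32;
  std::array<BitKind, NumBits> Bits;

  // Zero extension: bits 0..15 carry the 16-bit input, bits 16..31 come from
  // the same value but are already known to be zero (AssertZext / zext load).
  for (unsigned i = 0; i < NumBits; ++i)
    Bits[i] = i < 16 ? BitKind::Variable : BitKind::VariableKnownToBeZero;

  // The AND clears the upper half. Before the patch every cleared bit became
  // ConstZero and had to be masked; now bits that are already known to be
  // zero are simply kept.
  const uint32_t Mask = 0x0000FFFF;
  unsigned ConstZeroBits = 0;
  for (unsigned i = 0; i < NumBits; ++i) {
    if (((Mask >> i) & 1) || Bits[i] == BitKind::VariableKnownToBeZero)
      continue;                   // bit survives the AND, or is already zero
    Bits[i] = BitKind::ConstZero; // would require an explicit mask
    ++ConstZeroBits;
  }

  // Prints 0: every bit cleared by the AND was already known to be zero.
  std::cout << "bits that still need masking: " << ConstZeroBits << '\n';
  return 0;
}

The loop mirrors the new AND handling: a cleared bit stays as the input's known-zero bit instead of being turned into ConstZero, so no masking work is left over.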

This patch also renames "HasZeros" to "NeedMask", since we may now skip masking even when we have zeros (of type VariableKnownToBeZero).
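
As a companion illustration, here is a tiny self-contained sketch of why the flag is now called NeedMask: a masking step is needed only when some bit is ConstZero, while VariableKnownToBeZero bits are zero without any extra instruction. The needMask helper below is a simplified assumption for illustration, not the selector's real interface.

// Tiny sketch of the renamed flag (simplified assumption, not the real code):
// masking is needed exactly when some bit is ConstZero, which matches how the
// patched computeRotationAmounts() sets NeedMask.
#include <iostream>
#include <vector>

enum class BitKind { ConstZero, Variable, VariableKnownToBeZero };

static bool needMask(const std::vector<BitKind> &Bits) {
  for (BitKind K : Bits)
    if (K == BitKind::ConstZero) // VariableKnownToBeZero never forces a mask
      return true;
  return false;
}

int main() {
  // 16 live bits followed by 16 zero bits, classified the old and the new way.
  std::vector<BitKind> OldStyle(16, BitKind::Variable);
  std::vector<BitKind> NewStyle(16, BitKind::Variable);
  OldStyle.resize(32, BitKind::ConstZero);             // zeros had to be masked
  NewStyle.resize(32, BitKind::VariableKnownToBeZero); // zeros come for free
  std::cout << std::boolalpha
            << "old HasZeros-style masking: " << needMask(OldStyle) << '\n'  // true
            << "new NeedMask:               " << needMask(NewStyle) << '\n'; // false
  return 0;
}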

Differential Revision: https://reviews.llvm.org/D48025

llvm-svn: 344347
inouehrs committed Oct 12, 2018
1 parent 6cbb3ca commit 9552dd1
Showing 5 changed files with 143 additions and 28 deletions.
119 changes: 104 additions & 15 deletions llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -1083,9 +1083,14 @@ class BitPermutationSelector {
// lowest-order bit.
unsigned Idx;

// ConstZero means a bit we need to mask off.
// Variable is a bit that comes from an input variable.
// VariableKnownToBeZero is also a bit that comes from an input variable,
// but it is known to be already zero, so we do not need to mask it.
enum Kind {
ConstZero,
Variable
Variable,
VariableKnownToBeZero
} K;

ValueBit(SDValue V, unsigned I, Kind K = Variable)
@@ -1094,11 +1099,11 @@ class BitPermutationSelector {
: V(SDValue(nullptr, 0)), Idx(UINT32_MAX), K(K) {}

bool isZero() const {
return K == ConstZero;
return K == ConstZero || K == VariableKnownToBeZero;
}

bool hasValue() const {
return K == Variable;
return K == Variable || K == VariableKnownToBeZero;
}

SDValue getValue() const {
@@ -1248,8 +1253,14 @@ class BitPermutationSelector {
for (unsigned i = 0; i < NumBits; ++i)
if (((Mask >> i) & 1) == 1)
Bits[i] = (*LHSBits)[i];
else
Bits[i] = ValueBit(ValueBit::ConstZero);
else {
// AND instruction masks this bit. If the input is already zero,
// we have nothing to do here. Otherwise, make the bit ConstZero.
if ((*LHSBits)[i].isZero())
Bits[i] = (*LHSBits)[i];
else
Bits[i] = ValueBit(ValueBit::ConstZero);
}

return std::make_pair(Interesting, &Bits);
}
@@ -1259,15 +1270,43 @@ class BitPermutationSelector {
const auto &RHSBits = *getValueBits(V.getOperand(1), NumBits).second;

bool AllDisjoint = true;
for (unsigned i = 0; i < NumBits; ++i)
if (LHSBits[i].isZero())
SDValue LastVal = SDValue();
unsigned LastIdx = 0;
for (unsigned i = 0; i < NumBits; ++i) {
if (LHSBits[i].isZero() && RHSBits[i].isZero()) {
// If both inputs are known to be zero and one is ConstZero and
// the other is VariableKnownToBeZero, we can select whichever
// we like. To minimize the number of bit groups, we select
// VariableKnownToBeZero if this bit is the next bit of the same
// input variable as the previous bit. Otherwise, we select
// ConstZero.
if (LHSBits[i].hasValue() && LHSBits[i].getValue() == LastVal &&
LHSBits[i].getValueBitIndex() == LastIdx + 1)
Bits[i] = LHSBits[i];
else if (RHSBits[i].hasValue() && RHSBits[i].getValue() == LastVal &&
RHSBits[i].getValueBitIndex() == LastIdx + 1)
Bits[i] = RHSBits[i];
else
Bits[i] = ValueBit(ValueBit::ConstZero);
}
else if (LHSBits[i].isZero())
Bits[i] = RHSBits[i];
else if (RHSBits[i].isZero())
Bits[i] = LHSBits[i];
else {
AllDisjoint = false;
break;
}
// We remember the value and bit index of this bit.
if (Bits[i].hasValue()) {
LastVal = Bits[i].getValue();
LastIdx = Bits[i].getValueBitIndex();
}
else {
if (LastVal) LastVal = SDValue();
LastIdx = 0;
}
}

if (!AllDisjoint)
break;
Expand All @@ -1293,6 +1332,44 @@ class BitPermutationSelector {

return std::make_pair(Interesting, &Bits);
}
case ISD::AssertZext: {
// For AssertZext, we look through the operand and
// mark the bits known to be zero.
const SmallVector<ValueBit, 64> *LHSBits;
std::tie(Interesting, LHSBits) = getValueBits(V.getOperand(0),
NumBits);

EVT FromType = cast<VTSDNode>(V.getOperand(1))->getVT();
const unsigned NumValidBits = FromType.getSizeInBits();
for (unsigned i = 0; i < NumValidBits; ++i)
Bits[i] = (*LHSBits)[i];

// These bits are known to be zero.
for (unsigned i = NumValidBits; i < NumBits; ++i)
Bits[i] = ValueBit((*LHSBits)[i].getValue(),
(*LHSBits)[i].getValueBitIndex(),
ValueBit::VariableKnownToBeZero);

return std::make_pair(Interesting, &Bits);
}
case ISD::LOAD:
LoadSDNode *LD = cast<LoadSDNode>(V);
if (ISD::isZEXTLoad(V.getNode()) && V.getResNo() == 0) {
EVT VT = LD->getMemoryVT();
const unsigned NumValidBits = VT.getSizeInBits();

for (unsigned i = 0; i < NumValidBits; ++i)
Bits[i] = ValueBit(V, i);

// These bits are known to be zero.
for (unsigned i = NumValidBits; i < NumBits; ++i)
Bits[i] = ValueBit(V, i, ValueBit::VariableKnownToBeZero);

// The zero-extending load itself cannot be optimized, so it is not
// interesting by itself, though it gives useful information.
return std::make_pair(Interesting = false, &Bits);
}
break;
}

for (unsigned i = 0; i < NumBits; ++i)
@@ -1304,7 +1381,7 @@ class BitPermutationSelector {
// For each value (except the constant ones), compute the left-rotate amount
// to get it from its original to final position.
void computeRotationAmounts() {
HasZeros = false;
NeedMask = false;
RLAmt.resize(Bits.size());
for (unsigned i = 0; i < Bits.size(); ++i)
if (Bits[i].hasValue()) {
@@ -1314,7 +1391,7 @@ class BitPermutationSelector {
else
RLAmt[i] = Bits.size() - (VBI - i);
} else if (Bits[i].isZero()) {
HasZeros = true;
NeedMask = true;
RLAmt[i] = UINT32_MAX;
} else {
llvm_unreachable("Unknown value bit type");
@@ -1330,6 +1407,7 @@ class BitPermutationSelector {
unsigned LastRLAmt = RLAmt[0];
SDValue LastValue = Bits[0].hasValue() ? Bits[0].getValue() : SDValue();
unsigned LastGroupStartIdx = 0;
bool IsGroupOfZeros = !Bits[LastGroupStartIdx].hasValue();
for (unsigned i = 1; i < Bits.size(); ++i) {
unsigned ThisRLAmt = RLAmt[i];
SDValue ThisValue = Bits[i].hasValue() ? Bits[i].getValue() : SDValue();
@@ -1342,17 +1420,28 @@ class BitPermutationSelector {
LastGroupStartIdx = 0;
}

// If this bit is known to be zero and the current group is a bit group
// of zeros, we do not need to terminate the current bit group even if the
// Value or RLAmt does not match here. Instead, we terminate this group
// when the first non-zero bit appears later.
if (IsGroupOfZeros && Bits[i].isZero())
continue;

// If this bit has the same underlying value and the same rotate factor as
// the last one, then they're part of the same group.
if (ThisRLAmt == LastRLAmt && ThisValue == LastValue)
continue;
// We cannot continue the current bit group if this bit is not known to
// be zero in a bit group of zeros.
if (!(IsGroupOfZeros && ThisValue && !Bits[i].isZero()))
continue;

if (LastValue.getNode())
BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx,
i-1));
LastRLAmt = ThisRLAmt;
LastValue = ThisValue;
LastGroupStartIdx = i;
IsGroupOfZeros = !Bits[LastGroupStartIdx].hasValue();
}
if (LastValue.getNode())
BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx,
@@ -1698,7 +1787,7 @@ class BitPermutationSelector {
// If we've not yet selected a 'starting' instruction, and we have no zeros
// to fill in, select the (Value, RLAmt) with the highest priority (largest
// number of groups), and start with this rotated value.
if ((!HasZeros || LateMask) && !Res) {
if ((!NeedMask || LateMask) && !Res) {
ValueRotInfo &VRI = ValueRotsVec[0];
if (VRI.RLAmt) {
if (InstCnt) *InstCnt += 1;
@@ -2077,7 +2166,7 @@ class BitPermutationSelector {
// If we've not yet selected a 'starting' instruction, and we have no zeros
// to fill in, select the (Value, RLAmt) with the highest priority (largest
// number of groups), and start with this rotated value.
if ((!HasZeros || LateMask) && !Res) {
if ((!NeedMask || LateMask) && !Res) {
// If we have both Repl32 groups and non-Repl32 groups, the non-Repl32
// groups will come first, and so the VRI representing the largest number
// of groups might not be first (it might be the first Repl32 groups).
@@ -2230,7 +2319,7 @@ class BitPermutationSelector {

SmallVector<ValueBit, 64> Bits;

bool HasZeros;
bool NeedMask;
SmallVector<unsigned, 64> RLAmt;

SmallVector<BitGroup, 16> BitGroups;
@@ -2259,10 +2348,10 @@ class BitPermutationSelector {
" selection for: ");
LLVM_DEBUG(N->dump(CurDAG));

// Fill it RLAmt and set HasZeros.
// Fill in RLAmt and set NeedMask.
computeRotationAmounts();

if (!HasZeros)
if (!NeedMask)
return Select(N, false);

// We currently have two techniques for handling results with zeros: early
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/PowerPC/addi-offset-fold.ll
@@ -27,10 +27,9 @@ entry:
; FIXME: We don't need to do these stores at all.
; CHECK-DAG: std 3, -24(1)
; CHECK-DAG: stb 4, -16(1)
; CHECK-DAG: sldi [[REG3:[0-9]+]], 4, 32
; CHECK-DAG: lwz [[REG2:[0-9]+]], -20(1)
; CHECK-DAG: or [[REG4:[0-9]+]], [[REG2]], [[REG3]]
; CHECK: rldicl 3, [[REG4]], 33, 57
; CHECK-DAG: rlwinm 3, [[REG2]], 1, 31, 31
; CHECK: rlwimi 3, 4, 1, 25, 30
; CHECK: blr
}

35 changes: 32 additions & 3 deletions llvm/test/CodeGen/PowerPC/bitfieldinsert.ll
@@ -1,6 +1,35 @@
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s

; equivalent C code
; struct s64 {
; int a:5;
; int b:16;
; long c:42;
; };
; void bitfieldinsert64(struct s64 *p, unsigned short v) {
; p->b = v;
; }

%struct.s64 = type { i64 }

define void @bitfieldinsert64(%struct.s64* nocapture %p, i16 zeroext %v) {
; CHECK-LABEL: @bitfieldinsert64
; CHECK: ld [[REG1:[0-9]+]], 0(3)
; CHECK-NEXT: rlwimi [[REG1]], 4, 5, 11, 26
; CHECK-NEXT: std [[REG1]], 0(3)
; CHECK-NEXT: blr
entry:
%0 = getelementptr inbounds %struct.s64, %struct.s64* %p, i64 0, i32 0
%1 = zext i16 %v to i64
%bf.load = load i64, i64* %0, align 8
%bf.shl = shl nuw nsw i64 %1, 5
%bf.clear = and i64 %bf.load, -2097121
%bf.set = or i64 %bf.clear, %bf.shl
store i64 %bf.set, i64* %0, align 8
ret void
}

; bitfieldinsert32: Test for rlwimi
; equivalent C code
; struct s32 {
@@ -17,9 +46,9 @@
define void @bitfieldinsert32(%struct.s32* nocapture %p, i32 zeroext %v) {
; CHECK-LABEL: @bitfieldinsert32
; CHECK: lwz [[REG1:[0-9]+]], 0(3)
; CHECK: rlwimi [[REG1]], 4, 8, 8, 23
; CHECK: stw [[REG1]], 0(3)
; CHECK: blr
; CHECK-NEXT: rlwimi [[REG1]], 4, 8, 8, 23
; CHECK-NEXT: stw [[REG1]], 0(3)
; CHECK-NEXT: blr
entry:
%0 = getelementptr inbounds %struct.s32, %struct.s32* %p, i64 0, i32 0
%bf.load = load i32, i32* %0, align 4
10 changes: 4 additions & 6 deletions llvm/test/CodeGen/PowerPC/ppc64le-aggregates.ll
@@ -236,14 +236,12 @@ entry:
; CHECK-DAG: stfs 6, [[OFF1:[0-9]+]](1)
; CHECK-DAG: stfs 7, [[OFF2:[0-9]+]](1)
; CHECK-DAG: stfs 8, [[OFF3:[0-9]+]](1)
; CHECK-DAG: lwz [[REG0:[0-9]+]], [[OFF0]](1)
; CHECK-DAG: lwz 9, [[OFF0]](1)
; CHECK-DAG: lwz [[REG1:[0-9]+]], [[OFF1]](1)
; CHECK-DAG: lwz [[REG2:[0-9]+]], [[OFF2]](1)
; CHECK-DAG: lwz 10, [[OFF2]](1)
; CHECK-DAG: lwz [[REG3:[0-9]+]], [[OFF3]](1)
; CHECK-DAG: sldi [[REG1]], [[REG1]], 32
; CHECK-DAG: sldi [[REG3]], [[REG3]], 32
; CHECK-DAG: or 9, [[REG0]], [[REG1]]
; CHECK-DAG: or 10, [[REG2]], [[REG3]]
; CHECK-DAG: rldimi 9, [[REG1]], 32, 0
; CHECK-DAG: rldimi 10, [[REG3]], 32, 0
; CHECK: bl test1

declare void @test1([8 x float], [8 x float])
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/PowerPC/rlwimi-dyn-and.ll
@@ -39,7 +39,7 @@ next:
ret i32 %conv174

; CHECK-LABEL: @test2
; CHECK: slwi 3, {{[0-9]+}}, 7
; CHECK: rlwinm 3, {{[0-9]+}}, 7, 17, 24
; CHECK: rlwimi 3, {{[0-9]+}}, 15, 16, 16
; CHECK: blr
}