480 changes: 480 additions & 0 deletions llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -119,6 +119,7 @@ namespace {
SDNode *Select(SDNode *N) override;

SDNode *SelectBitfieldInsert(SDNode *N);
SDNode *SelectBitPermutation(SDNode *N);

/// SelectCC - Select a comparison of the specified values with the
/// specified condition code, returning the CR# of the expression.
@@ -532,6 +533,481 @@ SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
return nullptr;
}

namespace {
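// The BitPermutationSelector matches trees of rotates, shifts, ands, and ors,
// decomposes the result into a per-bit description of where each output bit
// comes from, and then rebuilds the computation from a small number of
// rotate-and-mask (rlwinm/rlwimi) and masking instructions.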
class BitPermutationSelector {
struct ValueBit {
SDValue V;

// The bit number in the value, using a convention where bit 0 is the
// lowest-order bit.
unsigned Idx;

enum Kind {
ConstZero,
Variable
} K;

ValueBit(SDValue V, unsigned I, Kind K = Variable)
: V(V), Idx(I), K(K) {}
ValueBit(Kind K = Variable)
: V(SDValue(nullptr, 0)), Idx(UINT32_MAX), K(K) {}

bool isZero() const {
return K == ConstZero;
}

bool hasValue() const {
return K == Variable;
}

SDValue getValue() const {
assert(hasValue() && "Cannot get the value of a constant bit");
return V;
}

unsigned getValueBitIndex() const {
assert(hasValue() && "Cannot get the value bit index of a constant bit");
return Idx;
}
};

// A bit group is a contiguous run of bits that all share the same
// underlying value and the same rotate factor.
struct BitGroup {
SDValue V;
unsigned RLAmt;
unsigned StartIdx, EndIdx;

BitGroup(SDValue V, unsigned R, unsigned S, unsigned E)
: V(V), RLAmt(R), StartIdx(S), EndIdx(E) {
DEBUG(dbgs() << "\tbit group for " << V.getNode() << " RLAmt = " << R <<
" [" << S << ", " << E << "]\n");
}
};

// Information on each (Value, RLAmt) pair (like the number of groups
// associated with each) used to choose the lowering method.
struct ValueRotInfo {
SDValue V;
unsigned RLAmt;
unsigned NumGroups;
unsigned FirstGroupStartIdx;

ValueRotInfo()
: RLAmt(UINT32_MAX), NumGroups(0), FirstGroupStartIdx(UINT32_MAX) {}

// For sorting (in reverse order) by NumGroups, and then by
// FirstGroupStartIdx.
bool operator < (const ValueRotInfo &Other) const {
if (NumGroups > Other.NumGroups)
return true;
else if (NumGroups < Other.NumGroups)
return false;
else if (FirstGroupStartIdx < Other.FirstGroupStartIdx)
return true;
return false;
}
};

// Return true if something interesting was deduced; return false if we're
// providing only a generic representation of V (or something else likewise
// uninteresting for instruction selection).
bool getValueBits(SDValue V, SmallVector<ValueBit, 64> &Bits) {
switch (V.getOpcode()) {
default: break;
case ISD::ROTL:
if (isa<ConstantSDNode>(V.getOperand(1))) {
unsigned RotAmt = V.getConstantOperandVal(1);

SmallVector<ValueBit, 64> LHSBits(Bits.size());
getValueBits(V.getOperand(0), LHSBits);

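// For a left rotate by RotAmt, result bit i comes from source bit
// (i - RotAmt) mod Bits.size(); the conditional below folds in the
// wrap-around for i < RotAmt.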
for (unsigned i = 0; i < Bits.size(); ++i)
Bits[i] = LHSBits[i < RotAmt ? i + (Bits.size() - RotAmt) : i - RotAmt];

return true;
}
break;
case ISD::SHL:
if (isa<ConstantSDNode>(V.getOperand(1))) {
unsigned ShiftAmt = V.getConstantOperandVal(1);

SmallVector<ValueBit, 64> LHSBits(Bits.size());
getValueBits(V.getOperand(0), LHSBits);

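// A left shift moves source bit i - ShiftAmt into position i, and the
// vacated low-order bits become constant zeros.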
for (unsigned i = ShiftAmt; i < Bits.size(); ++i)
Bits[i] = LHSBits[i - ShiftAmt];

for (unsigned i = 0; i < ShiftAmt; ++i)
Bits[i] = ValueBit(ValueBit::ConstZero);

return true;
}
break;
case ISD::SRL:
if (isa<ConstantSDNode>(V.getOperand(1))) {
unsigned ShiftAmt = V.getConstantOperandVal(1);

SmallVector<ValueBit, 64> LHSBits(Bits.size());
getValueBits(V.getOperand(0), LHSBits);

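// A logical right shift moves source bit i + ShiftAmt into position i,
// and the vacated high-order bits become constant zeros.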
for (unsigned i = 0; i < Bits.size() - ShiftAmt; ++i)
Bits[i] = LHSBits[i + ShiftAmt];

for (unsigned i = Bits.size() - ShiftAmt; i < Bits.size(); ++i)
Bits[i] = ValueBit(ValueBit::ConstZero);

return true;
}
break;
case ISD::AND:
if (isa<ConstantSDNode>(V.getOperand(1))) {
uint64_t Mask = V.getConstantOperandVal(1);

SmallVector<ValueBit, 64> LHSBits(Bits.size());
bool LHSInteresting = getValueBits(V.getOperand(0), LHSBits);

for (unsigned i = 0; i < Bits.size(); ++i)
if (((Mask >> i) & 1) == 1)
Bits[i] = LHSBits[i];
else
Bits[i] = ValueBit(ValueBit::ConstZero);

// Mark this as interesting only if the LHS was also interesting. This
// prevents the overall procedure from matching a single immediate 'and'
// (which is non-optimal because such an and might be folded with other
// things if we don't select it here).
return LHSInteresting;
}
break;
case ISD::OR: {
SmallVector<ValueBit, 64> LHSBits(Bits.size()), RHSBits(Bits.size());
getValueBits(V.getOperand(0), LHSBits);
getValueBits(V.getOperand(1), RHSBits);

bool AllDisjoint = true;
for (unsigned i = 0; i < Bits.size(); ++i)
if (LHSBits[i].isZero())
Bits[i] = RHSBits[i];
else if (RHSBits[i].isZero())
Bits[i] = LHSBits[i];
else {
AllDisjoint = false;
break;
}

if (!AllDisjoint)
break;

return true;
}
}

for (unsigned i = 0; i < Bits.size(); ++i)
Bits[i] = ValueBit(V, i);

return false;
}

// For each value (except the constant ones), compute the left-rotate amount
// to get it from its original to final position.
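// For example, with 32-bit values, a bit that must move from original
// index 7 to final index 5 gets RLAmt = 32 - (7 - 5) = 30 (a left rotate
// by 30 is a right rotate by 2).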
void computeRotationAmounts() {
HasZeros = false;
RLAmt.resize(Bits.size());
for (unsigned i = 0; i < Bits.size(); ++i)
if (Bits[i].hasValue()) {
unsigned VBI = Bits[i].getValueBitIndex();
if (i >= VBI)
RLAmt[i] = i - VBI;
else
RLAmt[i] = Bits.size() - (VBI - i);
} else if (Bits[i].isZero()) {
HasZeros = true;
RLAmt[i] = UINT32_MAX;
} else {
llvm_unreachable("Unknown value bit type");
}
}

// Collect groups of consecutive bits with the same underlying value and
// rotation factor.
void collectBitGroups() {
BitGroups.clear();

unsigned LastRLAmt = RLAmt[0];
SDValue LastValue = Bits[0].hasValue() ? Bits[0].getValue() : SDValue();
unsigned LastGroupStartIdx = 0;
for (unsigned i = 1; i < Bits.size(); ++i) {
unsigned ThisRLAmt = RLAmt[i];
SDValue ThisValue = Bits[i].hasValue() ? Bits[i].getValue() : SDValue();

// If this bit has the same underlying value and the same rotate factor as
// the last one, then they're part of the same group.
if (ThisRLAmt == LastRLAmt && ThisValue == LastValue)
continue;

if (LastValue.getNode())
BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx,
i-1));
LastRLAmt = ThisRLAmt;
LastValue = ThisValue;
LastGroupStartIdx = i;
}
if (LastValue.getNode())
BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx,
Bits.size()-1));

if (BitGroups.empty())
return;

// We might be able to combine the first and last groups.
if (BitGroups.size() > 1) {
// If the first and last groups are the same, then remove the first group
// in favor of the last group, making the ending index of the last group
// equal to the ending index of the to-be-removed first group.
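// (This is safe because rlwinm/rlwimi masks may wrap around when MB > ME,
// so the merged wrap-around group still selects to a single instruction.)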
if (BitGroups[0].StartIdx == 0 &&
BitGroups[BitGroups.size()-1].EndIdx == Bits.size()-1 &&
BitGroups[0].V == BitGroups[BitGroups.size()-1].V &&
BitGroups[0].RLAmt == BitGroups[BitGroups.size()-1].RLAmt) {
BitGroups[BitGroups.size()-1].EndIdx = BitGroups[0].EndIdx;
BitGroups.erase(BitGroups.begin());
}
}
}

// Take all (SDValue, RLAmt) pairs and sort them by the number of groups
// associated with each. If there is a degeneracy, pick the one that occurs
// first (in the final value).
void collectValueRotInfo() {
ValueRots.clear();

for (auto &BG : BitGroups) {
ValueRotInfo &VRI = ValueRots[std::make_pair(BG.V, BG.RLAmt)];
VRI.V = BG.V;
VRI.RLAmt = BG.RLAmt;
VRI.NumGroups += 1;
VRI.FirstGroupStartIdx = std::min(VRI.FirstGroupStartIdx, BG.StartIdx);
}

// Now that we've collected the various ValueRotInfo instances, we need to
// sort them.
ValueRotsVec.clear();
for (auto &I : ValueRots) {
ValueRotsVec.push_back(I.second);
}
std::sort(ValueRotsVec.begin(), ValueRotsVec.end());
}

SDValue getI32Imm(unsigned Imm) {
return CurDAG->getTargetConstant(Imm, MVT::i32);
}

// Depending on the number of groups for a particular value, it might be
// better to rotate, mask explicitly (using andi/andis), and then or the
// result. Select this part of the result first.
void SelectAndParts32(SDNode *N, SDValue &Res) {
SDLoc dl(N);

for (ValueRotInfo &VRI : ValueRotsVec) {
unsigned Mask = 0;
for (unsigned i = 0; i < Bits.size(); ++i) {
if (!Bits[i].hasValue() || Bits[i].getValue() != VRI.V)
continue;
if (RLAmt[i] != VRI.RLAmt)
continue;
Mask |= (1u << i);
}

// Compute the masks for andi/andis that would be necessary.
unsigned ANDIMask = (Mask & UINT16_MAX), ANDISMask = Mask >> 16;
assert((ANDIMask != 0 || ANDISMask != 0) &&
"No set bits in mask for value bit groups");
bool NeedsRotate = VRI.RLAmt != 0;

// We're trying to minimize the number of instructions. If we have one
// group, using one of andi/andis can break even. If we have three
// groups, we can use both andi and andis and break even (to use both
// andi and andis we also need to or the results together). We need four
// groups if we also need to rotate. To use andi/andis we need to do more
// than break even because rotate-and-mask instructions tend to be easier
// to schedule.
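// For example, a rotated value with set bits in both halves costs
// rlwinm + andi. + andis. + or = 4 instructions (5 when or'ing with an
// existing partial result), so with this accounting it must cover at
// least 5 groups before the andi/andis form wins.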

// FIXME: We've biased here against using andi/andis, which is right for
// POWER cores, but not optimal everywhere. For example, on the A2,
// andi/andis have single-cycle latency whereas the rotate-and-mask
// instructions take two cycles, and it would be better to bias toward
// andi/andis in break-even cases.

unsigned NumAndInsts = (unsigned) NeedsRotate +
(unsigned) (ANDIMask != 0) +
(unsigned) (ANDISMask != 0) +
(unsigned) (ANDIMask != 0 && ANDISMask != 0) +
(unsigned) (bool) Res;
if (NumAndInsts >= VRI.NumGroups)
continue;

SDValue VRot;
if (VRI.RLAmt) {
SDValue Ops[] =
{ VRI.V, getI32Imm(VRI.RLAmt), getI32Imm(0), getI32Imm(31) };
VRot = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
Ops), 0);
} else {
VRot = VRI.V;
}

SDValue ANDIVal, ANDISVal;
if (ANDIMask != 0)
ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo, dl, MVT::i32,
VRot, getI32Imm(ANDIMask)), 0);
if (ANDISMask != 0)
ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo, dl, MVT::i32,
VRot, getI32Imm(ANDISMask)), 0);

SDValue TotalVal;
if (!ANDIVal)
TotalVal = ANDISVal;
else if (!ANDISVal)
TotalVal = ANDIVal;
else
TotalVal = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32,
ANDIVal, ANDISVal), 0);

if (!Res)
Res = TotalVal;
else
Res = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32,
Res, TotalVal), 0);

// Now, remove all groups with this underlying value and rotation
// factor.
for (auto I = BitGroups.begin(); I != BitGroups.end();) {
if (I->V == VRI.V && I->RLAmt == VRI.RLAmt)
I = BitGroups.erase(I);
else
++I;
}
}
}

// Instruction selection for the 32-bit case.
SDNode *Select32(SDNode *N) {
SDLoc dl(N);
SDValue Res;

// Take care of cases that should use andi/andis first.
SelectAndParts32(N, Res);

// If we've not yet selected a 'starting' instruction, and we have no zeros
// to fill in, select the (Value, RLAmt) with the highest priority (largest
// number of groups), and start with this rotated value.
if (!HasZeros && !Res) {
ValueRotInfo &VRI = ValueRotsVec[0];
if (VRI.RLAmt) {
SDValue Ops[] =
{ VRI.V, getI32Imm(VRI.RLAmt), getI32Imm(0), getI32Imm(31) };
Res = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
} else {
Res = VRI.V;
}

// Now, remove all groups with this underlying value and rotation factor.
for (auto I = BitGroups.begin(); I != BitGroups.end();) {
if (I->V == VRI.V && I->RLAmt == VRI.RLAmt)
I = BitGroups.erase(I);
else
++I;
}
}

// Insert the other groups (one at a time).
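// Note that the rlwinm/rlwimi mask operands are numbered from the MSB
// (PowerPC bit 0), so a group spanning [StartIdx, EndIdx] in our LSB-0
// numbering has MB = Bits.size() - EndIdx - 1 and
// ME = Bits.size() - StartIdx - 1.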
for (auto &BG : BitGroups) {
if (!Res.getNode()) {
SDValue Ops[] =
{ BG.V, getI32Imm(BG.RLAmt), getI32Imm(Bits.size() - BG.EndIdx - 1),
getI32Imm(Bits.size() - BG.StartIdx - 1) };
Res = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
} else {
SDValue Ops[] =
{ Res, BG.V, getI32Imm(BG.RLAmt), getI32Imm(Bits.size() - BG.EndIdx - 1),
getI32Imm(Bits.size() - BG.StartIdx - 1) };
Res = SDValue(CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops), 0);
}
}

return Res.getNode();
}

SmallVector<ValueBit, 64> Bits;

bool HasZeros;
SmallVector<unsigned, 64> RLAmt;

SmallVector<BitGroup, 16> BitGroups;

DenseMap<std::pair<SDValue, unsigned>, ValueRotInfo> ValueRots;
SmallVector<ValueRotInfo, 16> ValueRotsVec;

SelectionDAG *CurDAG;

public:
BitPermutationSelector(SelectionDAG *DAG)
: CurDAG(DAG) {}

// Here we try to match complex bit permutations into a set of
// rotate-and-mask/shift/and/or instructions, using a set of heuristics
// known to produce optimal code for common cases (like i32 byte swapping).
SDNode *Select(SDNode *N) {
Bits.resize(N->getValueType(0).getSizeInBits());
if (!getValueBits(SDValue(N, 0), Bits))
return nullptr;

DEBUG(dbgs() << "Considering bit-permutation-based instruction"
" selection for: ");
DEBUG(N->dump(CurDAG));

// Fill in RLAmt and set HasZeros.
computeRotationAmounts();

// Fill in BitGroups.
collectBitGroups();
if (BitGroups.empty())
return nullptr;

// Fill in ValueRotsVec.
collectValueRotInfo();

if (Bits.size() == 32) {
return Select32(N);
} else {
assert(Bits.size() == 64 && "Not 64 bits here?");
// TODO: The 64-bit case!
}

return nullptr;
}
};
} // anonymous namespace

SDNode *PPCDAGToDAGISel::SelectBitPermutation(SDNode *N) {
if (N->getValueType(0) != MVT::i32 &&
N->getValueType(0) != MVT::i64)
return nullptr;

switch (N->getOpcode()) {
default: break;
case ISD::ROTL:
case ISD::SHL:
case ISD::SRL:
case ISD::AND:
case ISD::OR: {
BitPermutationSelector BPS(CurDAG);
return BPS.Select(N);
}
}

return nullptr;
}

/// SelectCC - Select a comparison of the specified values with the specified
/// condition code, returning the CR# of the expression.
SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS,
@@ -947,6 +1423,10 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
N->getOperand(1).getOpcode() == ISD::TargetConstant)
llvm_unreachable("Invalid ADD with TargetConstant operand");

// Try matching complex bit permutations before doing anything else.
if (SDNode *NN = SelectBitPermutation(N))
return NN;

switch (N->getOpcode()) {
default: break;

55 changes: 0 additions & 55 deletions llvm/lib/Target/PowerPC/README.txt
@@ -533,32 +533,6 @@ _foo:

===-------------------------------------------------------------------------===

We compile:

unsigned test6(unsigned x) {
return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
}

into:

_test6:
lis r2, 255
rlwinm r3, r3, 16, 0, 31
ori r2, r2, 255
and r3, r3, r2
blr

GCC gets it down to:

_test6:
rlwinm r0,r3,16,8,15
rlwinm r3,r3,16,24,31
or r3,r3,r0
blr


===-------------------------------------------------------------------------===

Consider a function like this:

float foo(float X) { return X + 1234.4123f; }
@@ -655,35 +629,6 @@ _bar:

===-------------------------------------------------------------------------===

We currently compile 32-bit bswap:

declare i32 @llvm.bswap.i32(i32 %A)
define i32 @test(i32 %A) {
%B = call i32 @llvm.bswap.i32(i32 %A)
ret i32 %B
}

to:

_test:
rlwinm r2, r3, 24, 16, 23
slwi r4, r3, 24
rlwimi r2, r3, 8, 24, 31
rlwimi r4, r3, 8, 8, 15
rlwimi r4, r2, 0, 16, 31
mr r3, r4
blr

it would be more efficient to produce:

_foo: mr r0,r3
rlwinm r3,r3,8,0xffffffff
rlwimi r3,r0,24,0,7
rlwimi r3,r0,24,16,23
blr

===-------------------------------------------------------------------------===

test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:

__ZNK4llvm5APInt17countLeadingZerosEv:
40 changes: 40 additions & 0 deletions llvm/test/CodeGen/PowerPC/bperm.ll
@@ -0,0 +1,40 @@
; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
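; Tests for bit-permutation-based instruction selection: trees of
; rotates/shifts/ands/ors (such as i32 bswap) should become compact
; rlwinm/rlwimi sequences.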
target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64-unknown-linux-gnu"

; Function Attrs: nounwind readnone
define zeroext i32 @bs4(i32 zeroext %a) #0 {
entry:
%0 = tail call i32 @llvm.bswap.i32(i32 %a)
ret i32 %0

; CHECK-LABEL: @bs4
; CHECK: rlwinm [[REG1:[0-9]+]], 3, 8, 0, 31
; CHECK: rlwimi [[REG1]], 3, 24, 16, 23
; CHECK: rlwimi [[REG1]], 3, 24, 0, 7
; CHECK: mr 3, [[REG1]]
; CHECK: blr
}

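; This is the masked-swap example formerly tracked in README.txt; the
; bit-permutation selector now produces the rlwinm/rlwimi pair directly.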
; Function Attrs: nounwind readnone
define zeroext i32 @test6(i32 zeroext %x) #0 {
entry:
%and = lshr i32 %x, 16
%shr = and i32 %and, 255
%and1 = shl i32 %x, 16
%shl = and i32 %and1, 16711680
%or = or i32 %shr, %shl
ret i32 %or

; CHECK-LABEL: @test6
; CHECK: rlwinm [[REG1:[0-9]+]], 3, 16, 24, 31
; CHECK: rlwimi [[REG1]], 3, 16, 8, 15
; CHECK: mr 3, [[REG1]]
; CHECK: blr
}

; Function Attrs: nounwind readnone
declare i32 @llvm.bswap.i32(i32) #0

attributes #0 = { nounwind readnone }

6 changes: 2 additions & 4 deletions llvm/test/CodeGen/PowerPC/rlwimi-and.ll
@@ -28,11 +28,9 @@ codeRepl17: ; preds = %codeRepl4
store i16 %rvml38.sroa.0.0.insert.insert, i16* undef, align 2
unreachable

; FIXME: the SLWI could be folded into the RLWIMI to give a rotate of 8.
; CHECK: @test
; CHECK-DAG: slwi [[R1:[0-9]+]], {{[0-9]+}}, 31
; CHECK-DAG: rlwinm [[R2:[0-9]+]], {{[0-9]+}}, 0, 31, 31
; CHECK: rlwimi [[R2]], [[R1]], 9, 23, 23
; CHECK: rlwinm [[R1:[0-9]+]], {{[0-9]+}}, 0, 31, 31
; CHECK: rlwimi [[R1]], {{[0-9]+}}, 8, 23, 23

codeRepl29: ; preds = %codeRepl1
unreachable
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/PowerPC/rlwimi2.ll
@@ -1,7 +1,7 @@
; All of these ands and shifts should be folded into rlwimi's
; RUN: llc < %s -march=ppc32 -o %t
; RUN: grep rlwimi %t | count 3
; RUN: grep srwi %t | count 1
; RUN: grep rlwimi %t | count 4
; RUN: not grep srwi %t
; RUN: not grep slwi %t

define i16 @test1(i32 %srcA, i32 %srcB, i32 %alpha) nounwind {