[X86] Recognize constant arrays with special values and replace loads from them with subtract and shift instructions, which will then be replaced by the X86 BZHI machine instruction.

Recognize constant arrays with the following values:
  0x0, 0x1, 0x3, 0x7, 0xF, 0x1F, ..., 2^(size-1) - 1
where 'size' is the size of the array.

The result of a load with index 'idx' from this array is equivalent to the result of the following:
  (0xFFFFFFFF >> (sub 32, idx))             (assuming an array of 32-bit integers)
For example, for idx = 5 the loaded value is 0x1F, and 0xFFFFFFFF >> (32 - 5) = 0xFFFFFFFF >> 27 = 0x1F.

The result of an 'AND' operation between such a loaded value and another input is exactly equivalent to the behavior of the X86 BZHI instruction.
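
For illustration only (not part of the commit), here is a minimal C sketch of the pattern this combine targets; the table contents follow the commit, while the names and the use of the BMI2 intrinsic _bzhi_u32 are illustrative:

  #include <stdint.h>
  #include <immintrin.h> /* _bzhi_u32; compile with -mbmi2 */

  /* Constant table of the recognized shape: fill_table[j] == (1u << j) - 1,
     i.e. exactly the j low bits are set. */
  static const uint32_t fill_table[8] = {0x0, 0x1, 0x3, 0x7,
                                         0xF, 0x1F, 0x3F, 0x7F};

  /* The 'load + and' pattern this combine matches. */
  uint32_t and_with_table(uint32_t x, unsigned idx) {
    return x & fill_table[idx];
  }

  /* Equivalent computation: zero all bits of x at positions >= idx. */
  uint32_t and_with_bzhi(uint32_t x, unsigned idx) {
    return _bzhi_u32(x, idx);
  }

With BMI2 enabled, both functions should compile down to a single 'bzhil' instruction once this combine fires, which is essentially what the LIT test below checks.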

See test cases in the LIT test for better understanding.

Differential Revision: https://reviews.llvm.org/D34141

llvm-svn: 320481
aymanmusa committed Dec 12, 2017
1 parent 2dd9835 commit c2eed92
Showing 2 changed files with 135 additions and 22 deletions.
121 changes: 121 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -33066,6 +33066,124 @@ static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
  return DAG.getBitcast(N->getValueType(0), Shift);
}

// Get the index node from the lowered DAG of a GEP IR instruction with one
// indexing dimension.
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
  if (Ld->isIndexed())
    return SDValue();

  SDValue Base = Ld->getBasePtr();

  if (Base.getOpcode() != ISD::ADD)
    return SDValue();

  SDValue ShiftedIndex = Base.getOperand(0);

  if (ShiftedIndex.getOpcode() != ISD::SHL)
    return SDValue();

  return ShiftedIndex.getOperand(0);

}

static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
  if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
    switch (VT.getSizeInBits()) {
    default: return false;
    case 64: return Subtarget.is64Bit();
    case 32: return true;
    }
  }
  return false;
}

// This function recognizes cases where the X86 BZHI instruction can replace an
// 'and-load' sequence.
// When an integer value is loaded from an array of constants defined as
// follows:
//
//   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF, ..., 2^(SIZE-1) - 1}
//
// and a bitwise AND is then applied to the loaded value and another input,
// the result is equivalent to performing BZHI (zero high bits) on the input,
// with the same index as the load.
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT VT = Node->getSimpleValueType(0);
  SDLoc dl(Node);

  // Check if the subtarget has a BZHI instruction for the node's type.
  if (!hasBZHI(Subtarget, VT))
    return SDValue();

  // Try matching the pattern for both operands.
  for (unsigned i = 0; i < 2; i++) {
    SDValue N = Node->getOperand(i);
    LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());

    // Move on to the other operand if this one is not a load instruction.
    if (!Ld)
      continue;

    const Value *MemOp = Ld->getMemOperand()->getValue();

    if (!MemOp)
      return SDValue();

    if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
      if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
        if (GV->isConstant() && GV->hasDefinitiveInitializer()) {

          Constant *Init = GV->getInitializer();
          Type *Ty = Init->getType();
          if (!isa<ConstantDataArray>(Init) ||
              !Ty->getArrayElementType()->isIntegerTy() ||
              Ty->getArrayElementType()->getScalarSizeInBits() !=
                  VT.getSizeInBits() ||
              Ty->getArrayNumElements() >
                  Ty->getArrayElementType()->getScalarSizeInBits())
            continue;

          // Check if the array's constant elements are suitable for our case.
          uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
          bool ConstantsMatch = true;
          for (uint64_t j = 0; j < ArrayElementCount; j++) {
            ConstantInt *Elem =
                dyn_cast<ConstantInt>(Init->getAggregateElement(j));
            if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
              ConstantsMatch = false;
              break;
            }
          }
          if (!ConstantsMatch)
            continue;

          // Do the transformation (for a 32-bit type):
          //   -> (and (load arr[idx]), inp)
          //   <- (and (srl 0xFFFFFFFF, (sub 32, idx)), inp)
          // which will then be matched to a single BZHI instruction.
          SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
          SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);

          // Get the node which indexes into the array.
          SDValue Index = getIndexFromUnindexedLoad(Ld);
          if (!Index)
            return SDValue();
          Index = DAG.getZExtOrTrunc(Index, dl, VT);

          SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);

          SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
          SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);

          return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
        }
      }
    }
  }
  return SDValue();
}

static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
@@ -33094,6 +33212,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
  if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
    return ShiftRight;

  if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
    return R;

  // Attempt to recursively combine a bitmask AND with shuffles.
  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    SDValue Op(N, 0);
36 changes: 14 additions & 22 deletions llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll
@@ -10,17 +10,14 @@
define i32 @f32_bzhi(i32 %x, i32 %y) local_unnamed_addr {
; CHECK-LABEL: f32_bzhi:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movslq %esi, %rax
-; CHECK-NEXT: andl fill_table32(,%rax,4), %edi
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: ret{{[l|q]}}
+; CHECK-NEXT: bzhil %esi, %edi, %eax
+; CHECK-NEXT: retq
;
; CHECK32-LABEL: f32_bzhi:
; CHECK32: # %bb.0: # %entry
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT: movl fill_table32(,%eax,4), %eax
-; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT: ret{{[l|q]}}
+; CHECK32-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT: retl
entry:
%idxprom = sext i32 %y to i64
%arrayidx = getelementptr inbounds [32 x i32], [32 x i32]* @fill_table32, i64 0, i64 %idxprom
@@ -32,17 +29,14 @@ entry:
define i32 @f32_bzhi_partial(i32 %x, i32 %y) local_unnamed_addr {
; CHECK-LABEL: f32_bzhi_partial:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movslq %esi, %rax
-; CHECK-NEXT: andl fill_table32_partial(,%rax,4), %edi
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: ret{{[l|q]}}
+; CHECK-NEXT: bzhil %esi, %edi, %eax
+; CHECK-NEXT: retq
;
; CHECK32-LABEL: f32_bzhi_partial:
; CHECK32: # %bb.0: # %entry
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT: movl fill_table32_partial(,%eax,4), %eax
-; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT: ret{{[l|q]}}
+; CHECK32-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT: retl
entry:
%idxprom = sext i32 %y to i64
%arrayidx = getelementptr inbounds [17 x i32], [17 x i32]* @fill_table32_partial, i64 0, i64 %idxprom
@@ -54,9 +48,8 @@ entry:
define i64 @f64_bzhi(i64 %x, i64 %y) local_unnamed_addr {
; CHECK-LABEL: f64_bzhi:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: andq fill_table64(,%rsi,8), %rdi
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: ret{{[l|q]}}
+; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
;
; CHECK32-LABEL: f64_bzhi:
; CHECK32: # %bb.0: # %entry
@@ -65,7 +58,7 @@ define i64 @f64_bzhi(i64 %x, i64 %y) local_unnamed_addr {
; CHECK32-NEXT: movl fill_table64(,%eax,8), %eax
; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %edx
-; CHECK32-NEXT: ret{{[l|q]}}
+; CHECK32-NEXT: retl
entry:
%arrayidx = getelementptr inbounds [64 x i64], [64 x i64]* @fill_table64, i64 0, i64 %y
%0 = load i64, i64* %arrayidx, align 8
@@ -76,9 +69,8 @@ entry:
define i64 @f64_bzhi_partial(i64 %x, i64 %y) local_unnamed_addr {
; CHECK-LABEL: f64_bzhi_partial:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: andq fill_table64_partial(,%rsi,8), %rdi
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: ret{{[l|q]}}
+; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
;
; CHECK32-LABEL: f64_bzhi_partial:
; CHECK32: # %bb.0: # %entry
@@ -87,7 +79,7 @@ define i64 @f64_bzhi_partial(i64 %x, i64 %y) local_unnamed_addr {
; CHECK32-NEXT: movl fill_table64_partial(,%eax,8), %eax
; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %edx
-; CHECK32-NEXT: ret{{[l|q]}}
+; CHECK32-NEXT: retl
entry:
%arrayidx = getelementptr inbounds [51 x i64], [51 x i64]* @fill_table64_partial, i64 0, i64 %y
%0 = load i64, i64* %arrayidx, align 8
