Skip to content

Commit

Permalink
Detect vector reduction operations just before instruction selection.
Browse files Browse the repository at this point in the history
This patch detects vector reductions before instruction selection. Vector
reductions are vectorized reduction operations, and for such operations we have
freedom to reorganize the elements of the result as long as their reduction
stays unchanged. This will enable some reduction pattern recognition during
instruction combine such as SAD/dot-product on X86. A flag is added to
SDNodeFlags to mark those vector reduction nodes to be checked during instruction
combine.

To detect those vector reductions, we search def-use chains starting from the
given instruction, and check if all uses fall into two categories:

1. Reduction with another vector.
2. Reduction on all elements.

Category 2 is detected by recognizing the pattern that the loop vectorizer
generates to reduce all elements of the vector outside of the loop, which
consists of several ShuffleVector instructions and one ExtractElement instruction.


Differential revision: http://reviews.llvm.org/D15250

llvm-svn: 261070
  • Loading branch information
Cong Hou committed Feb 17, 2016
1 parent a2b1f45 commit bbd4e3b
Show file tree
Hide file tree
Showing 3 changed files with 215 additions and 0 deletions.
4 changes: 4 additions & 0 deletions llvm/include/llvm/CodeGen/SelectionDAGNodes.h
Expand Up @@ -328,6 +328,7 @@ struct SDNodeFlags {
bool NoInfs : 1;
bool NoSignedZeros : 1;
bool AllowReciprocal : 1;
bool VectorReduction : 1;

public:
/// Default constructor turns off all optimization flags.
Expand All @@ -340,6 +341,7 @@ struct SDNodeFlags {
NoInfs = false;
NoSignedZeros = false;
AllowReciprocal = false;
VectorReduction = false;
}

// These are mutators for each flag.
Expand All @@ -351,6 +353,7 @@ struct SDNodeFlags {
void setNoInfs(bool b) { NoInfs = b; }
void setNoSignedZeros(bool b) { NoSignedZeros = b; }
void setAllowReciprocal(bool b) { AllowReciprocal = b; }
void setVectorReduction(bool b) { VectorReduction = b; }

// These are accessors for each flag.
bool hasNoUnsignedWrap() const { return NoUnsignedWrap; }
Expand All @@ -361,6 +364,7 @@ struct SDNodeFlags {
bool hasNoInfs() const { return NoInfs; }
bool hasNoSignedZeros() const { return NoSignedZeros; }
bool hasAllowReciprocal() const { return AllowReciprocal; }
bool hasVectorReduction() const { return VectorReduction; }

/// Return a raw encoding of the flags.
/// This function should only be used to add data to the NodeID value.
Expand Down
126 changes: 126 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
Expand Up @@ -2308,13 +2308,133 @@ void SelectionDAGBuilder::visitFSub(const User &I) {
visitBinary(I, ISD::FSUB);
}

/// Checks if the given instruction performs a vector reduction, in which case
/// we have the freedom to alter the elements in the result as long as the
/// reduction of them stays unchanged.
///
/// \param I the value to test. Anything that is not a vector-typed
///          Add/Mul/And/Or/Xor instruction, or an FAdd/FMul carrying the
///          unsafe-algebra fast-math flag, is rejected immediately.
/// \returns true only if every transitive user reached from \p I is one of the
///          accepted reduction-chain operations and an ExtractElement of
///          element 0 is reached after the vector was fully reduced.
static bool isVectorReductionOp(const User *I) {
  const Instruction *Inst = dyn_cast<Instruction>(I);
  if (!Inst || !Inst->getType()->isVectorTy())
    return false;

  auto OpCode = Inst->getOpcode();
  switch (OpCode) {
  case Instruction::Add:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    break;
  case Instruction::FAdd:
  case Instruction::FMul:
    // Floating-point reductions may only be reassociated when the operation
    // explicitly permits unsafe algebra.
    if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
      if (FPOp->getFastMathFlags().unsafeAlgebra())
        break;
    // Fall through.
  default:
    return false;
  }

  unsigned ElemNum = Inst->getType()->getVectorNumElements();
  // Number of leading elements that still hold partial results. It is halved
  // by every recognized shuffle + reduction step.
  unsigned ElemNumToReduce = ElemNum;

  // Do DFS search on the def-use chain from the given instruction. We only
  // allow four kinds of operations during the search until we reach the
  // instruction that extracts the first element from the vector:
  //
  //   1. The reduction operation of the same opcode as the given instruction.
  //
  //   2. PHI node.
  //
  //   3. ShuffleVector instruction together with a reduction operation that
  //      does a partial reduction.
  //
  //   4. ExtractElement that extracts the first element from the vector, and we
  //      stop searching the def-use chain here.
  //
  // 3 & 4 above perform a reduction on all elements of the vector. We push defs
  // from 1-3 to the stack to continue the DFS. The given instruction is not
  // a reduction operation if we meet any other instructions other than those
  // listed above.

  SmallVector<const User *, 16> UsersToVisit{Inst};
  SmallPtrSet<const User *, 16> Visited;
  bool ReduxExtracted = false;

  while (!UsersToVisit.empty()) {
    const auto *Cur = UsersToVisit.back();
    UsersToVisit.pop_back();
    // PHI nodes can make the def-use graph cyclic; visit each node once.
    if (!Visited.insert(Cur).second)
      continue;

    for (const auto *U : Cur->users()) {
      const auto *UserInst = dyn_cast<Instruction>(U);
      if (!UserInst)
        return false;

      if (UserInst->getOpcode() == OpCode || isa<PHINode>(U)) {
        // A floating-point link in the chain must itself allow unsafe algebra
        // (PHI nodes are exempt from this check).
        if (const FPMathOperator *FPOp =
                dyn_cast<const FPMathOperator>(UserInst))
          if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().unsafeAlgebra())
            return false;
        UsersToVisit.push_back(U);
      } else if (const ShuffleVectorInst *ShufInst =
                     dyn_cast<ShuffleVectorInst>(U)) {
        // Detect the following pattern: A ShuffleVector instruction together
        // with a reduction that do partial reduction on the first and second
        // ElemNumToReduce / 2 elements, and store the result in
        // ElemNumToReduce / 2 elements in another vector.

        // The mask has one entry per result element. A shuffle that narrows
        // the vector cannot be part of the pattern, and inspecting ElemNum
        // mask entries below would index past the end of its mask.
        unsigned ResultElements = ShufInst->getType()->getVectorNumElements();
        if (ResultElements < ElemNum)
          return false;

        if (ElemNumToReduce == 1)
          return false;
        if (!isa<UndefValue>(U->getOperand(1)))
          return false;
        // The low half of the live range must select the high half...
        for (unsigned i = 0; i < ElemNumToReduce / 2; ++i)
          if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2))
            return false;
        // ...and every remaining lane up to ElemNum must be undef (-1).
        for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i)
          if (ShufInst->getMaskValue(i) != -1)
            return false;

        // There is only one user of this ShuffleVector instruction, which must
        // be a reduction operation.
        if (!U->hasOneUse())
          return false;

        auto U2 = dyn_cast<Instruction>(*U->user_begin());
        if (!U2 || U2->getOpcode() != OpCode)
          return false;

        // Check operands of the reduction operation: it must combine the
        // shuffle with the shuffle's own source vector, in either order.
        if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) ||
            (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) {
          UsersToVisit.push_back(U2);
          ElemNumToReduce /= 2;
        } else
          return false;
      } else if (isa<ExtractElementInst>(U)) {
        // At this moment we should have reduced all elements in the vector.
        if (ElemNumToReduce != 1)
          return false;

        // Only an extract of lane 0 completes the pattern. isZero() is used
        // rather than getZExtValue() so that index constants wider than 64
        // bits are handled without tripping the width restriction.
        const ConstantInt *Val = dyn_cast<ConstantInt>(U->getOperand(1));
        if (!Val || !Val->isZero())
          return false;

        ReduxExtracted = true;
      } else
        return false;
    }
  }
  return ReduxExtracted;
}

void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
SDValue Op1 = getValue(I.getOperand(0));
SDValue Op2 = getValue(I.getOperand(1));

bool nuw = false;
bool nsw = false;
bool exact = false;
bool vec_redux = false;
FastMathFlags FMF;

if (const OverflowingBinaryOperator *OFBinOp =
Expand All @@ -2328,10 +2448,16 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&I))
FMF = FPOp->getFastMathFlags();

if (isVectorReductionOp(&I)) {
vec_redux = true;
DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n");
}

SDNodeFlags Flags;
Flags.setExact(exact);
Flags.setNoSignedWrap(nsw);
Flags.setNoUnsignedWrap(nuw);
Flags.setVectorReduction(vec_redux);
if (EnableFMFInDAG) {
Flags.setAllowReciprocal(FMF.allowReciprocal());
Flags.setNoInfs(FMF.noInfs());
Expand Down
85 changes: 85 additions & 0 deletions llvm/test/CodeGen/Generic/vector-redux.ll
@@ -0,0 +1,85 @@
; RUN: llc < %s -debug-only=isel -o /dev/null 2>&1 | FileCheck %s
; REQUIRES: asserts

@a = global [1024 x i32] zeroinitializer, align 16

; Sums floats loaded through %a using two <4 x float> accumulators (%vec.phi
; and %vec.phi9). The vector.body block contains five copies of the
; load/accumulate step for each accumulator, and middle.block combines the two
; accumulators, folds the lanes together with shufflevector + fadd steps, and
; extracts lane 0 as the scalar result.
;
; The function contains thirteen vector `fadd fast` instructions (ten in
; vector.body, three in middle.block) while eleven detection messages are
; expected below.
; NOTE(review): presumably the last two fadds in middle.block are not reported
; because a search starting from them does not see a full-width halving
; shuffle first -- confirm against the detection logic.
define float @reduce_add_float(float* nocapture readonly %a) {
; CHECK-LABEL: reduce_add_float
; CHECK: Detected a reduction operation: {{.*}} fadd fast
; CHECK: Detected a reduction operation: {{.*}} fadd fast
; CHECK: Detected a reduction operation: {{.*}} fadd fast
; CHECK: Detected a reduction operation: {{.*}} fadd fast
; CHECK: Detected a reduction operation: {{.*}} fadd fast
; CHECK: Detected a reduction operation: {{.*}} fadd fast
; CHECK: Detected a reduction operation: {{.*}} fadd fast
; CHECK: Detected a reduction operation: {{.*}} fadd fast
; CHECK: Detected a reduction operation: {{.*}} fadd fast
; CHECK: Detected a reduction operation: {{.*}} fadd fast
; CHECK: Detected a reduction operation: {{.*}} fadd fast
;
entry:
br label %vector.body

; Loop body: each group below loads two adjacent <4 x float> vectors and adds
; them into the two running accumulators.
vector.body:
%index = phi i64 [ 0, %entry ], [ %index.next.4, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %entry ], [ %28, %vector.body ]
%vec.phi9 = phi <4 x float> [ zeroinitializer, %entry ], [ %29, %vector.body ]
%0 = getelementptr inbounds float, float* %a, i64 %index
%1 = bitcast float* %0 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %1, align 4
%2 = getelementptr float, float* %0, i64 4
%3 = bitcast float* %2 to <4 x float>*
%wide.load10 = load <4 x float>, <4 x float>* %3, align 4
%4 = fadd fast <4 x float> %wide.load, %vec.phi
%5 = fadd fast <4 x float> %wide.load10, %vec.phi9
%index.next = add nuw nsw i64 %index, 8
%6 = getelementptr inbounds float, float* %a, i64 %index.next
%7 = bitcast float* %6 to <4 x float>*
%wide.load.1 = load <4 x float>, <4 x float>* %7, align 4
%8 = getelementptr float, float* %6, i64 4
%9 = bitcast float* %8 to <4 x float>*
%wide.load10.1 = load <4 x float>, <4 x float>* %9, align 4
%10 = fadd fast <4 x float> %wide.load.1, %4
%11 = fadd fast <4 x float> %wide.load10.1, %5
%index.next.1 = add nsw i64 %index, 16
%12 = getelementptr inbounds float, float* %a, i64 %index.next.1
%13 = bitcast float* %12 to <4 x float>*
%wide.load.2 = load <4 x float>, <4 x float>* %13, align 4
%14 = getelementptr float, float* %12, i64 4
%15 = bitcast float* %14 to <4 x float>*
%wide.load10.2 = load <4 x float>, <4 x float>* %15, align 4
%16 = fadd fast <4 x float> %wide.load.2, %10
%17 = fadd fast <4 x float> %wide.load10.2, %11
%index.next.2 = add nsw i64 %index, 24
%18 = getelementptr inbounds float, float* %a, i64 %index.next.2
%19 = bitcast float* %18 to <4 x float>*
%wide.load.3 = load <4 x float>, <4 x float>* %19, align 4
%20 = getelementptr float, float* %18, i64 4
%21 = bitcast float* %20 to <4 x float>*
%wide.load10.3 = load <4 x float>, <4 x float>* %21, align 4
%22 = fadd fast <4 x float> %wide.load.3, %16
%23 = fadd fast <4 x float> %wide.load10.3, %17
%index.next.3 = add nsw i64 %index, 32
%24 = getelementptr inbounds float, float* %a, i64 %index.next.3
%25 = bitcast float* %24 to <4 x float>*
%wide.load.4 = load <4 x float>, <4 x float>* %25, align 4
%26 = getelementptr float, float* %24, i64 4
%27 = bitcast float* %26 to <4 x float>*
%wide.load10.4 = load <4 x float>, <4 x float>* %27, align 4
%28 = fadd fast <4 x float> %wide.load.4, %22
%29 = fadd fast <4 x float> %wide.load10.4, %23
%index.next.4 = add nsw i64 %index, 40
%30 = icmp eq i64 %index.next.4, 1000
br i1 %30, label %middle.block, label %vector.body

; Horizontal reduction: add the two accumulators, then repeatedly add the upper
; half of the live lanes onto the lower half (4 -> 2 -> 1) and extract lane 0.
middle.block:
%.lcssa15 = phi <4 x float> [ %29, %vector.body ]
%.lcssa = phi <4 x float> [ %28, %vector.body ]
%bin.rdx = fadd fast <4 x float> %.lcssa15, %.lcssa
%rdx.shuf = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%bin.rdx11 = fadd fast <4 x float> %bin.rdx, %rdx.shuf
%rdx.shuf12 = shufflevector <4 x float> %bin.rdx11, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx13 = fadd fast <4 x float> %bin.rdx11, %rdx.shuf12
%31 = extractelement <4 x float> %bin.rdx13, i32 0
ret float %31
}

0 comments on commit bbd4e3b

Please sign in to comment.