165 changes: 151 additions & 14 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,48 @@ static bool isSplat(ArrayRef<Value *> VL) {
return true;
}

/// \returns the opcode that forms an alternating pair with \p Op
/// (add<->sub, fadd<->fsub), i.e. the opcode that can be clubbed with
/// \p Op into a sequence later merged as a ShuffleVector instruction;
/// 0 when \p Op has no alternate.
static unsigned getAltOpcode(unsigned Op) {
  if (Op == Instruction::FAdd)
    return Instruction::FSub;
  if (Op == Instruction::FSub)
    return Instruction::FAdd;
  if (Op == Instruction::Add)
    return Instruction::Sub;
  if (Op == Instruction::Sub)
    return Instruction::Add;
  return 0;
}

/// \returns true when opcode \p Op may participate in an alternating
/// add/sub (or fadd/fsub) sequence which can later be merged as a
/// ShuffleVector instruction.
static bool canCombineAsAltInst(unsigned Op) {
  switch (Op) {
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  default:
    return false;
  }
}

/// \returns Instruction::ShuffleVector if the instructions in \p VL form an
/// alternating fadd/fsub, fsub/fadd, add/sub or sub/add opcode sequence
/// (e.g. opcodes fadd, fsub, fadd, fsub, ...) which can later be merged as
/// a ShuffleVector instruction; otherwise 0.
static unsigned isAltInst(ArrayRef<Value *> VL) {
  // VL[0] need not be an instruction (it may be a constant or an argument);
  // bail out instead of dereferencing a null dyn_cast result.
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return 0;
  unsigned Opcode = I0->getOpcode();
  unsigned AltOpcode = getAltOpcode(Opcode);
  // An opcode with no alternate cannot begin an alternating sequence; this
  // also keeps a single-element VL from being misreported as a shuffle.
  if (!AltOpcode)
    return 0;
  // Even lanes must carry Opcode, odd lanes the alternate opcode.
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
      return 0;
  }
  return Instruction::ShuffleVector;
}

/// \returns The opcode if all of the Instructions in \p VL have the same
/// opcode, or zero.
static unsigned getSameOpcode(ArrayRef<Value *> VL) {
Expand All @@ -158,8 +200,11 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
unsigned Opcode = I0->getOpcode();
for (int i = 1, e = VL.size(); i < e; i++) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
if (!I || Opcode != I->getOpcode())
if (!I || Opcode != I->getOpcode()) {
if (canCombineAsAltInst(Opcode) && i == 1)
return isAltInst(VL);
return 0;
}
}
return Opcode;
}
Expand Down Expand Up @@ -377,6 +422,7 @@ class BoUpSLP {

/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();

private:
struct TreeEntry;

Expand Down Expand Up @@ -594,6 +640,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,

void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
bool SameTy = getSameType(VL); (void)SameTy;
bool isAltShuffle = false;
assert(SameTy && "Invalid types!");

if (Depth == RecursionMaxDepth) {
Expand All @@ -615,10 +662,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
newTreeEntry(VL, false);
return;
}
unsigned Opcode = getSameOpcode(VL);

// Check that this shuffle vector refers to the alternate
// sequence of opcodes.
if (Opcode == Instruction::ShuffleVector) {
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
unsigned Op = I0->getOpcode();
if (Op != Instruction::ShuffleVector)
isAltShuffle = true;
}

// If all of the operands are identical or constant we have a simple solution.
if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) ||
!getSameOpcode(VL)) {
if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
newTreeEntry(VL, false);
return;
Expand Down Expand Up @@ -754,8 +810,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {

DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

unsigned Opcode = getSameOpcode(VL);

// Check if it is safe to sink the loads or the stores.
if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
Instruction *Last = getLastInstruction(VL);
Expand Down Expand Up @@ -1057,6 +1111,26 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
}
return;
}
case Instruction::ShuffleVector: {
// If this is not an alternate sequence of opcode like add-sub
// then do not vectorize this instruction.
if (!isAltShuffle) {
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return;
}
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (unsigned j = 0; j < VL.size(); ++j)
Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

buildTree_rec(Operands, Depth + 1);
}
return;
}
default:
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
Expand All @@ -1080,11 +1154,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
return getGatherCost(E->Scalars);
}

assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) &&
"Invalid VL");
unsigned Opcode = getSameOpcode(VL);
assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
Instruction *VL0 = cast<Instruction>(VL[0]);
unsigned Opcode = VL0->getOpcode();
switch (Opcode) {
case Instruction::PHI: {
return 0;
Expand Down Expand Up @@ -1242,6 +1314,32 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {

return VecCallCost - ScalarCallCost;
}
case Instruction::ShuffleVector: {
TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_AnyValue;
int ScalarCost = 0;
int VecCost = 0;
for (unsigned i = 0; i < VL.size(); ++i) {
Instruction *I = cast<Instruction>(VL[i]);
if (!I)
break;
ScalarCost +=
TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
}
// VecCost is equal to sum of the cost of creating 2 vectors
// and the cost of creating shuffle.
Instruction *I0 = cast<Instruction>(VL[0]);
VecCost =
TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
Instruction *I1 = cast<Instruction>(VL[1]);
VecCost +=
TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
VecCost +=
TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
return VecCost - ScalarCost;
}
default:
llvm_unreachable("Unknown instruction");
}
Expand Down Expand Up @@ -1522,9 +1620,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
setInsertPointAfterBundle(E->Scalars);
return Gather(E->Scalars, VecTy);
}

unsigned Opcode = VL0->getOpcode();
assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode");
unsigned Opcode = getSameOpcode(E->Scalars);

switch (Opcode) {
case Instruction::PHI: {
Expand Down Expand Up @@ -1797,6 +1893,49 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
E->VectorizedValue = V;
return V;
}
case Instruction::ShuffleVector: {
ValueList LHSVL, RHSVL;
for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
}
setInsertPointAfterBundle(E->Scalars);

Value *LHS = vectorizeTree(LHSVL);
Value *RHS = vectorizeTree(RHSVL);

if (Value *V = alreadyVectorized(E->Scalars))
return V;

// Create a vector of LHS op1 RHS
BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);

// Create a vector of LHS op2 RHS
Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);

// Create appropriate shuffle to take alternative operations from
// the vector.
std::vector<Constant *> Mask(E->Scalars.size());
unsigned e = E->Scalars.size();
for (unsigned i = 0; i < e; ++i) {
if (i & 1)
Mask[i] = Builder.getInt32(e + i);
else
Mask[i] = Builder.getInt32(i);
}

Value *ShuffleMask = ConstantVector::get(Mask);

Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
E->VectorizedValue = V;
if (Instruction *I = dyn_cast<Instruction>(V))
return propagateMetadata(I, E->Scalars);

return V;
}
default:
llvm_unreachable("unknown inst");
}
Expand Down Expand Up @@ -1865,7 +2004,6 @@ Value *BoUpSLP::vectorizeTree() {
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];

// No need to handle users of gathered values.
if (Entry->NeedToGather)
continue;
Expand Down Expand Up @@ -2049,7 +2187,6 @@ struct SLPVectorizer : public FunctionPass {
for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
e = po_end(&F.getEntryBlock()); it != e; ++it) {
BasicBlock *BB = *it;

// Vectorize trees that end at stores.
if (unsigned count = collectStores(BB, R)) {
(void)count;
Expand Down
181 changes: 181 additions & 0 deletions llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@b = common global [4 x i32] zeroinitializer, align 16
@c = common global [4 x i32] zeroinitializer, align 16
@d = common global [4 x i32] zeroinitializer, align 16
@e = common global [4 x i32] zeroinitializer, align 16
@a = common global [4 x i32] zeroinitializer, align 16
@fb = common global [4 x float] zeroinitializer, align 16
@fc = common global [4 x float] zeroinitializer, align 16
@fa = common global [4 x float] zeroinitializer, align 16

; CHECK-LABEL: @addsub
; CHECK: %5 = add <4 x i32> %3, %4
; CHECK: %6 = add <4 x i32> %2, %5
; CHECK: %7 = sub <4 x i32> %2, %5
; CHECK: %8 = shufflevector <4 x i32> %6, <4 x i32> %7, <4 x i32> <i32 0, i32 5, i32 2, i32 7>

; Function Attrs: nounwind uwtable
; Scalar pattern: a[i] = (b[i] + c[i]) OP (d[i] + e[i]) where OP alternates
; add, sub, add, sub across lanes 0..3 -- the SLP vectorizer should merge
; the final binops into two <4 x i32> ops blended by a shufflevector.
define void @addsub() #0 {
entry:
; Lane 0: add
%0 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 0), align 4
%1 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 0), align 4
%add = add nsw i32 %0, %1
%2 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 0), align 4
%3 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 0), align 4
%add1 = add nsw i32 %2, %3
%add2 = add nsw i32 %add, %add1
store i32 %add2, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 0), align 4
; Lane 1: sub
%4 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 1), align 4
%5 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 1), align 4
%add3 = add nsw i32 %4, %5
%6 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 1), align 4
%7 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 1), align 4
%add4 = add nsw i32 %6, %7
%sub = sub nsw i32 %add3, %add4
store i32 %sub, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 1), align 4
; Lane 2: add
%8 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 2), align 4
%9 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 2), align 4
%add5 = add nsw i32 %8, %9
%10 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 2), align 4
%11 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 2), align 4
%add6 = add nsw i32 %10, %11
%add7 = add nsw i32 %add5, %add6
store i32 %add7, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 2), align 4
; Lane 3: sub
%12 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 3), align 4
%13 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 3), align 4
%add8 = add nsw i32 %12, %13
%14 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 3), align 4
%15 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 3), align 4
%add9 = add nsw i32 %14, %15
%sub10 = sub nsw i32 %add8, %add9
store i32 %sub10, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 3), align 4
ret void
}

; CHECK-LABEL: @subadd
; CHECK: %5 = add <4 x i32> %3, %4
; CHECK: %6 = sub <4 x i32> %2, %5
; CHECK: %7 = add <4 x i32> %2, %5
; CHECK: %8 = shufflevector <4 x i32> %6, <4 x i32> %7, <4 x i32> <i32 0, i32 5, i32 2, i32 7>

; Function Attrs: nounwind uwtable
; Mirror of @addsub with the opposite phase: the final opcode alternates
; sub, add, sub, add across lanes 0..3.
define void @subadd() #0 {
entry:
; Lane 0: sub
%0 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 0), align 4
%1 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 0), align 4
%add = add nsw i32 %0, %1
%2 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 0), align 4
%3 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 0), align 4
%add1 = add nsw i32 %2, %3
%sub = sub nsw i32 %add, %add1
store i32 %sub, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 0), align 4
; Lane 1: add
%4 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 1), align 4
%5 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 1), align 4
%add2 = add nsw i32 %4, %5
%6 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 1), align 4
%7 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 1), align 4
%add3 = add nsw i32 %6, %7
%add4 = add nsw i32 %add2, %add3
store i32 %add4, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 1), align 4
; Lane 2: sub
%8 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 2), align 4
%9 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 2), align 4
%add5 = add nsw i32 %8, %9
%10 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 2), align 4
%11 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 2), align 4
%add6 = add nsw i32 %10, %11
%sub7 = sub nsw i32 %add5, %add6
store i32 %sub7, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 2), align 4
; Lane 3: add
%12 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 3), align 4
%13 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 3), align 4
%add8 = add nsw i32 %12, %13
%14 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 3), align 4
%15 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 3), align 4
%add9 = add nsw i32 %14, %15
%add10 = add nsw i32 %add8, %add9
store i32 %add10, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 3), align 4
ret void
}

; CHECK-LABEL: @faddfsub
; CHECK: %2 = fadd <4 x float> %0, %1
; CHECK: %3 = fsub <4 x float> %0, %1
; CHECK: %4 = shufflevector <4 x float> %2, <4 x float> %3, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; Function Attrs: nounwind uwtable
; Scalar pattern: fa[i] = fb[i] OP fc[i] with OP alternating fadd, fsub,
; fadd, fsub across lanes 0..3 -- the floating-point alternate-opcode case.
define void @faddfsub() #0 {
entry:
%0 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
%1 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
%add = fadd float %0, %1
store float %add, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
%2 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
%3 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
%sub = fsub float %2, %3
store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
%4 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
%5 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
%add1 = fadd float %4, %5
store float %add1, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
%6 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
%7 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
%sub2 = fsub float %6, %7
store float %sub2, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
ret void
}

; CHECK-LABEL: @fsubfadd
; CHECK: %2 = fsub <4 x float> %0, %1
; CHECK: %3 = fadd <4 x float> %0, %1
; CHECK: %4 = shufflevector <4 x float> %2, <4 x float> %3, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; Function Attrs: nounwind uwtable
; Mirror of @faddfsub with the opposite phase: OP alternates fsub, fadd,
; fsub, fadd across lanes 0..3.
define void @fsubfadd() #0 {
entry:
%0 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
%1 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
%sub = fsub float %0, %1
store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
%2 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
%3 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
%add = fadd float %2, %3
store float %add, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
%4 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
%5 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
%sub1 = fsub float %4, %5
store float %sub1, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
%6 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
%7 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
%add2 = fadd float %6, %7
store float %add2, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
ret void
}

; CHECK-LABEL: @No_faddfsub
; CHECK-NOT: fadd <4 x float>
; CHECK-NOT: fsub <4 x float>
; CHECK-NOT: shufflevector
; Function Attrs: nounwind uwtable
; Negative test: opcodes run fadd, fadd, fadd, fsub -- NOT a strictly
; alternating sequence, so no vector binops or shufflevector may be formed.
define void @No_faddfsub() #0 {
entry:
%0 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
%1 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
%add = fadd float %0, %1
store float %add, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
%2 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
%3 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
%add1 = fadd float %2, %3
store float %add1, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
%4 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
%5 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
%add2 = fadd float %4, %5
store float %add2, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
%6 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
%7 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
%sub = fsub float %6, %7
store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
ret void
}

attributes #0 = { nounwind }