Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5679,6 +5679,18 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
if (CostKind != TTI::TCK_RecipThroughput)
return Invalid;

unsigned Ratio =
AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();

// A ratio of 1 would mean it's similar to a regular add, e.g.
// v4i64 partial.reduce(v4i64 %acc, v4i64 %vec)
// <=> add v4i64 %acc, %vec
if (Ratio == 1) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rather than checking for ratio=1, you should either replace

  1. https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp#L323 with getArithmeticInstructionCost (I believe this line is simply checking for the simple add reduction)

OR

  1. https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp#L3552 with getArithmeticInstructionCost

auto *T = VectorType::get(AccumType, VF);
return getArithmeticInstrCost(Opcode, T, CostKind) +
(BinOp ? getArithmeticInstrCost(*BinOp, T, CostKind) : 0);
}

if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
(!ST->isNeonAvailable() || !ST->hasDotProd()))
return Invalid;
Expand All @@ -5700,8 +5712,6 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
if (IsUSDot && !ST->hasMatMulInt8())
return Invalid;

unsigned Ratio =
AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
if (VF.getKnownMinValue() <= Ratio)
return Invalid;

Expand Down
9 changes: 1 addition & 8 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8171,15 +8171,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
}

VPValue *Cond = nullptr;
if (CM.blockNeedsPredicationForAnyReason(Reduction->getParent())) {
assert((ReductionOpcode == Instruction::Add ||
ReductionOpcode == Instruction::Sub) &&
"Expected an ADD or SUB operation for predicated partial "
"reductions (because the neutral element in the mask is zero)!");
if (CM.blockNeedsPredicationForAnyReason(Reduction->getParent()))
Cond = getBlockInMask(Builder.getInsertBlock());
VPValue *Zero = Plan.getConstantInt(Reduction->getType(), 0);
BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
}
return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
ScaleFactor, Reduction);
}
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -2378,6 +2378,8 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// Get the factor that the VF of this recipe's output should be scaled by.
unsigned getVFScaleFactor() const { return VFScaleFactor; }

void setVFScaleFactor(unsigned F) { VFScaleFactor = F; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
Expand Down
10 changes: 8 additions & 2 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -395,12 +395,18 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) {
assert(getOpcode() == Instruction::Add &&
"Unhandled partial reduction opcode");

Value *BinOpVal = State.get(getOperand(1));
Value *PhiVal = State.get(getOperand(0));
Value *BinOpVal = State.get(getVecOp());
Value *PhiVal = State.get(getChainOp());
assert(PhiVal && BinOpVal && "Phi and Mul must be set");

Type *RetTy = PhiVal->getType();

if (isConditional()) {
Value *Cond = State.get(getCondOp());
Value *Zero = ConstantInt::get(BinOpVal->getType(), 0);
BinOpVal = Builder.CreateSelect(Cond, BinOpVal, Zero);
}

CallInst *V =
Builder.CreateIntrinsic(RetTy, Intrinsic::vector_partial_reduce_add,
{PhiVal, BinOpVal}, nullptr, "partial.reduce");
Expand Down
67 changes: 63 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
using namespace llvm;
using namespace VPlanPatternMatch;

#define DEBUG_TYPE "loop-vectorize"

static cl::opt<bool> EnableWideActiveLaneMask(
"enable-wide-lane-mask", cl::init(false), cl::Hidden,
cl::desc("Enable use of wide get active lane mask instructions"));
Expand Down Expand Up @@ -3761,7 +3763,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,

/// This function tries to create abstract recipes from the reduction recipe for
/// following optimizations and cost estimation.
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
static bool tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
VPCostContext &Ctx,
VFRange &Range) {
VPExpressionRecipe *AbstractR = nullptr;
Expand All @@ -3773,19 +3775,76 @@ static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
AbstractR = ExtRed;
// Cannot create abstract inloop reduction recipes.
if (!AbstractR)
return;
return false;

AbstractR->insertBefore(*VPBB, IP);
Red->replaceAllUsesWith(AbstractR);
return true;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to add `Red->eraseFromParent();` before this?

}

/// Lower a partial reduction back to a regular reduction, by
/// changing the in-loop partial reduction to a binop and removing
/// the scale factor from the PHI node.
static void lowerPartialReduction(VPlan &Plan, VPPartialReductionRecipe *Red,
VPCostContext &Ctx) {
VPRecipeBase *Acc = Red->getChainOp()->getDefiningRecipe();
if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Acc)) {
PhiR->setVFScaleFactor(1);

// We also need to update the scale factor of the reduction-start-vector
// operand.
VPValue *StartV, *IdentityV;
if (!match(PhiR->getOperand(0),
m_VPInstruction<VPInstruction::ReductionStartVector>(
m_VPValue(StartV), m_VPValue(IdentityV), m_VPValue())))
llvm_unreachable("Unexpected operand for a partial reduction");
Type *I32Ty = IntegerType::getInt32Ty(Plan.getContext());
auto *ScaleFactorVPV = Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, 1));
cast<VPInstruction>(PhiR->getOperand(0))->setOperand(2, ScaleFactorVPV);
}

if (auto *R = dyn_cast<VPPartialReductionRecipe>(Acc))
if (R->getVFScaleFactor() != 1)
lowerPartialReduction(Plan, R, Ctx);

LLVM_DEBUG(
dbgs() << "LV: Lowering " << *Red
<< " back to regular reduction, because it is not profitable\n");
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we want an "a" before "regular", i.e. "back to a regular reduction".


// Lower the partial reduction to a regular binop.
VPBuilder Builder(Red);
VPInstruction *Add = Builder.createNaryOp(
RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
{Red->getChainOp(), Red->getVecOp()});
if (Red->isConditional())
Add = Builder.createSelect(Red->getCondOp(), Add, Red->getChainOp());

Red->replaceAllUsesWith(Add);
Red->eraseFromParent();
}

void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
VFRange &Range) {
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
if (auto *Red = dyn_cast<VPReductionRecipe>(&R))
tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
auto *Red = dyn_cast<VPReductionRecipe>(&R);
if (!Red)
continue;

if (!tryToCreateAbstractReductionRecipe(Red, Ctx, Range) &&
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If `Red` is converted to an abstract recipe, `Red` should be null. Can you check `Red` instead of returning a bool from `tryToCreateAbstractReductionRecipe`?

isa<VPPartialReductionRecipe>(Red)) {
// If there isn't a profitable VPExpression for a partial reduction,
// then that suggests using a partial reduction is not profitable
// for this VPlan. It seems better to resort to a regular (middle-block)
// reduction, so that the this plan is still profitable to consider.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is an extra "the" here ("so that the this plan").

// Otherwise, the plan might be discarded in favour of a smaller VF.
//
// FIXME: There's a lot to unpick when it comes to partial
// reductions, but this should provide a temporary stop-gap until we
// reimplement the logic for creating partial reductions.
lowerPartialReduction(Plan, cast<VPPartialReductionRecipe>(Red), Ctx);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should only call this if the reduction actually is partial, otherwise we'll waste some time essentially doing nothing in the lower function.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it would be great if we can lower this here, basically after this

}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -482,29 +482,29 @@ define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) {
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT]] to <8 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64>
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64>
; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v8i64(<4 x i64> [[VEC_PHI]], <8 x i64> [[TMP3]])
; CHECK-NEXT: [[TMP8]] = add <8 x i64> [[VEC_PHI]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i32> [[TMP5]] to <8 x i64>
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[PARTIAL_REDUCE]])
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP8]])
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP6]], i32 7
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
Expand All @@ -520,9 +520,9 @@ define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) {
; CHECK-NEXT: [[LOAD_EXT:%.*]] = sext i16 [[LOAD]] to i32
; CHECK-NEXT: [[LOAD_EXT_EXT]] = sext i32 [[LOAD_EXT]] to i64
; CHECK-NEXT: [[EXITCOND740_NOT:%.*]] = icmp eq i64 [[IV]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK-NEXT: br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], %[[LOOP]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[ADD_LCSSA]]
;
entry:
Expand Down
Loading