Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8197,8 +8197,11 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
return tryToWidenMemory(Instr, Operands, Range);

if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr)) {
if (auto PartialRed =
tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value()))
return PartialRed;
}
Comment on lines -8200 to +8204
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bad change?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is intentional? tryToCreatePartialReduction can now return nullptr on line 8185 so I presume we need to check it here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we need to account for tryToCreatePartialReduction now possibly returning nullptr


if (!shouldWiden(Instr, Range))
return nullptr;
Expand Down Expand Up @@ -8232,6 +8235,10 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
isa<VPPartialReductionRecipe>(BinOpRecipe))
std::swap(BinOp, Accumulator);

if (ScaleFactor !=
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does the scale-factor of the accumulator (PHI) not match that of the reduction? Is there something missing elsewhere that causes this to not match?

Possibly unrelated, but when I print the values of BinOp/Accumulator, I see that BinOp = WIDEN_REDUCTION-PHI and Accumulator = WIDEN-CAST, which is then swapped above to result in BinOp = WIDEN-CAST. That does not look like a binary-op to me, so. Is this poorly named or does the code actually assume a 'dot-product'-like operation which is not what it gets?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does the scale-factor of the accumulator (PHI) not match that of the reduction? Is there something missing elsewhere that causes this to not match?

Ths can happen when there's a chain and the first entry can use a partial reduction but the second entry in the chain cannot.

Possibly unrelated, but when I print the values of BinOp/Accumulator, I see that BinOp = WIDEN_REDUCTION-PHI and Accumulator = WIDEN-CAST, which is then swapped above to result in BinOp = WIDEN-CAST. That does not look like a binary-op to me, so. Is this poorly named or does the code actually assume a 'dot-product'-like operation which is not what it gets?

I think it is poorly named, as this also supports using dot product instructions for widening adds, by multiplying with 1

vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()))
return nullptr;

unsigned ReductionOpcode = Reduction->getOpcode();
if (ReductionOpcode == Instruction::Sub) {
auto *const Zero = ConstantInt::get(Reduction->getType(), 0);
Expand Down
17 changes: 2 additions & 15 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -395,20 +395,6 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
return Base::properlyDominates(ParentA, ParentB);
}

/// Get the VF scaling factor applied to the recipe's output, if the recipe has
/// one.
static unsigned getVFScaleFactor(VPValue *R) {
if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
return RR->getVFScaleFactor();
if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
return RR->getVFScaleFactor();
assert(
(!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
VPInstruction::ReductionStartVector) &&
"getting scaling factor of reduction-start-vector not implemented yet");
return 1;
}

bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI,
unsigned OverrideMaxNumRegs) const {
return any_of(MaxLocalUsers, [&TTI, &OverrideMaxNumRegs](auto &LU) {
Expand Down Expand Up @@ -571,7 +557,8 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
} else {
// The output from scaled phis and scaled reductions actually has
// fewer lanes than the VF.
unsigned ScaleFactor = getVFScaleFactor(VPV);
unsigned ScaleFactor =
vputils::getVFScaleFactor(VPV->getDefiningRecipe());
ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
LLVM_DEBUG(if (VF != VFs[J]) {
dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
Expand Down
14 changes: 14 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,20 @@ VPBasicBlock *vputils::getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT) {
return I == DepthFirst.end() ? nullptr : cast<VPBasicBlock>(*I);
}

unsigned vputils::getVFScaleFactor(VPRecipeBase *R) {
if (!R)
return 1;
if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
return RR->getVFScaleFactor();
if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
return RR->getVFScaleFactor();
assert(
(!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
VPInstruction::ReductionStartVector) &&
"getting scaling factor of reduction-start-vector not implemented yet");
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will this now result in the compiler (with asserts enabled) to fail on valid code when trying to vectorise it?
If so then this needs to be a FIXME instead.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This just moves the existing code and allows R to be a nullptr, so I don't think this should introduce a new assertion failure.

return 1;
}

std::optional<VPValue *>
vputils::getRecipesForUncountableExit(VPlan &Plan,
SmallVectorImpl<VPRecipeBase *> &Recipes,
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,10 @@ bool isUniformAcrossVFsAndUFs(VPValue *V);
/// exist.
VPBasicBlock *getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT);

/// Get the VF scaling factor applied to the recipe's output, if the recipe has
/// one.
unsigned getVFScaleFactor(VPRecipeBase *R);

/// Returns the VPValue representing the uncountable exit comparison used by
/// AnyOf if the recipes it depends on can be traced back to live-ins and
/// the addresses (in GEP/PtrAdd form) of any (non-masked) load used in
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1381,8 +1381,8 @@ for.body: ; preds = %for.body.preheader,
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
}

define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
; CHECK-NEON-LABEL: define i32 @red_extended_add_chain(
define i32 @red_extended_add_incomplete_chain(ptr %start, ptr %end, i32 %offset) {
; CHECK-NEON-LABEL: define i32 @red_extended_add_incomplete_chain(
; CHECK-NEON-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEON-NEXT: entry:
; CHECK-NEON-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
Expand All @@ -1404,7 +1404,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
; CHECK-NEON-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
; CHECK-NEON-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <16 x i32> @llvm.vector.partial.reduce.add.v16i32.v16i32(<16 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test should probably be renamed now that it doesn't produce a partial reduction.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks, updated to red_extended_add_incomplete_chain

; CHECK-NEON-NEXT: [[TMP4]] = add <16 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
Expand All @@ -1415,7 +1415,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEON: scalar.ph:
;
; CHECK-SVE-LABEL: define i32 @red_extended_add_chain(
; CHECK-SVE-LABEL: define i32 @red_extended_add_incomplete_chain(
; CHECK-SVE-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-SVE-NEXT: entry:
; CHECK-SVE-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
Expand Down Expand Up @@ -1452,7 +1452,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
;
; CHECK-SVE-MAXBW-LABEL: define i32 @red_extended_add_chain(
; CHECK-SVE-MAXBW-LABEL: define i32 @red_extended_add_incomplete_chain(
; CHECK-SVE-MAXBW-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-SVE-MAXBW-NEXT: entry:
; CHECK-SVE-MAXBW-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
Expand All @@ -1478,7 +1478,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
; CHECK-SVE-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 8 x i32> @llvm.vector.partial.reduce.add.nxv8i32.nxv8i32(<vscale x 8 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP7]])
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = add <vscale x 8 x i32> [[VEC_PHI]], [[TMP7]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8]] = add <vscale x 8 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
Expand Down
Loading