Support generic expansion of ordered vector reduction (PR36732)
Without fast-math flags, the llvm.experimental.vector.reduce.fadd/fmul intrinsics must be expanded as ordered (sequential) reductions rather than as shuffle-based tree reductions.

This patch scalarizes the reduction, applying the accumulator at the start of the sequence: (((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ... + Scl[NumElts-1]
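
This mirrors the fadd_f32_strict_accum case in the updated test below; roughly, a strict <4 x float> fadd reduction with an accumulator now expands to a sequential extractelement/fadd chain:

  %0 = extractelement <4 x float> %vec, i32 0
  %bin.rdx = fadd float %accum, %0
  %1 = extractelement <4 x float> %vec, i32 1
  %bin.rdx1 = fadd float %bin.rdx, %1
  %2 = extractelement <4 x float> %vec, i32 2
  %bin.rdx2 = fadd float %bin.rdx1, %2
  %3 = extractelement <4 x float> %vec, i32 3
  %bin.rdx3 = fadd float %bin.rdx2, %3
  ret float %bin.rdx3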

Differential Revision: https://reviews.llvm.org/D45366

llvm-svn: 329585
RKSimon committed Apr 9, 2018
1 parent bec8a66 commit 23c2182
Showing 4 changed files with 84 additions and 14 deletions.
7 changes: 7 additions & 0 deletions llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -509,6 +509,13 @@ bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
LoopSafetyInfo *SafetyInfo,
OptimizationRemarkEmitter *ORE = nullptr);

/// Generates an ordered vector reduction using extracts to reduce the value.
Value *
getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src, unsigned Op,
RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
RecurrenceDescriptor::MRK_Invalid,
ArrayRef<Value *> RedOps = None);

/// Generates a vector reduction using shufflevectors to reduce the value.
Value *getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
RecurrenceDescriptor::MinMaxRecurrenceKind
15 changes: 9 additions & 6 deletions llvm/lib/CodeGen/ExpandReductions.cpp
@@ -78,25 +78,26 @@ RecurrenceDescriptor::MinMaxRecurrenceKind getMRK(Intrinsic::ID ID) {

bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
bool Changed = false;
SmallVector<IntrinsicInst*, 4> Worklist;
SmallVector<IntrinsicInst *, 4> Worklist;
for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
if (auto II = dyn_cast<IntrinsicInst>(&*I))
Worklist.push_back(II);

for (auto *II : Worklist) {
IRBuilder<> Builder(II);
bool IsOrdered = false;
Value *Acc = nullptr;
Value *Vec = nullptr;
auto ID = II->getIntrinsicID();
auto MRK = RecurrenceDescriptor::MRK_Invalid;
switch (ID) {
case Intrinsic::experimental_vector_reduce_fadd:
case Intrinsic::experimental_vector_reduce_fmul:
// FMFs must be attached to the call, otherwise it's an ordered reduction
// and it can't be handled by generating this shuffle sequence.
// TODO: Implement scalarization of ordered reductions here for targets
// without native support.
// and it can't be handled by generating a shuffle sequence.
if (!II->getFastMathFlags().isFast())
continue;
IsOrdered = true;
Acc = II->getArgOperand(0);
Vec = II->getArgOperand(1);
break;
case Intrinsic::experimental_vector_reduce_add:
@@ -118,7 +119,9 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
}
if (!TTI->shouldExpandReduction(II))
continue;
auto Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
Value *Rdx =
IsOrdered ? getOrderedReduction(Builder, Acc, Vec, getOpcode(ID), MRK)
: getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
II->replaceAllUsesWith(Rdx);
II->eraseFromParent();
Changed = true;
32 changes: 32 additions & 0 deletions llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1526,6 +1526,38 @@ static Value *addFastMathFlag(Value *V) {
return V;
}

// Helper to generate an ordered reduction.
Value *
llvm::getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src,
unsigned Op,
RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
ArrayRef<Value *> RedOps) {
unsigned VF = Src->getType()->getVectorNumElements();

// Extract and apply reduction ops in ascending order:
// e.g. (((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ... + Scl[VF-1]
Value *Result = Acc;
for (unsigned ExtractIdx = 0; ExtractIdx != VF; ++ExtractIdx) {
Value *Ext =
Builder.CreateExtractElement(Src, Builder.getInt32(ExtractIdx));

if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
Result = Builder.CreateBinOp((Instruction::BinaryOps)Op, Result, Ext,
"bin.rdx");
} else {
assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
"Invalid min/max");
Result = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, Result,
Ext);
}

if (!RedOps.empty())
propagateIRFlags(Result, RedOps);
}

return Result;
}

// Helper to generate a log2 shuffle reduction.
Value *
llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
44 changes: 36 additions & 8 deletions llvm/test/CodeGen/Generic/expand-experimental-reductions.ll
@@ -117,8 +117,15 @@ entry:
define float @fadd_f32_strict(<4 x float> %vec) {
; CHECK-LABEL: @fadd_f32_strict(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[R:%.*]] = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> [[VEC:%.*]])
; CHECK-NEXT: ret float [[R]]
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd float undef, [[TMP0]]
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
; CHECK-NEXT: [[BIN_RDX1:%.*]] = fadd float [[BIN_RDX]], [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd float [[BIN_RDX1]], [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
; CHECK-NEXT: [[BIN_RDX3:%.*]] = fadd float [[BIN_RDX2]], [[TMP3]]
; CHECK-NEXT: ret float [[BIN_RDX3]]
;
entry:
%r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)
@@ -128,8 +135,15 @@ entry:
define float @fadd_f32_strict_accum(float %accum, <4 x float> %vec) {
; CHECK-LABEL: @fadd_f32_strict_accum(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[R:%.*]] = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float [[ACCUM:%.*]], <4 x float> [[VEC:%.*]])
; CHECK-NEXT: ret float [[R]]
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd float [[ACCUM:%.*]], [[TMP0]]
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
; CHECK-NEXT: [[BIN_RDX1:%.*]] = fadd float [[BIN_RDX]], [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd float [[BIN_RDX1]], [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
; CHECK-NEXT: [[BIN_RDX3:%.*]] = fadd float [[BIN_RDX2]], [[TMP3]]
; CHECK-NEXT: ret float [[BIN_RDX3]]
;
entry:
%r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec)
@@ -169,8 +183,15 @@ entry:
define float @fmul_f32_strict(<4 x float> %vec) {
; CHECK-LABEL: @fmul_f32_strict(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[R:%.*]] = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> [[VEC:%.*]])
; CHECK-NEXT: ret float [[R]]
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
; CHECK-NEXT: [[BIN_RDX:%.*]] = fmul float undef, [[TMP0]]
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
; CHECK-NEXT: [[BIN_RDX1:%.*]] = fmul float [[BIN_RDX]], [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul float [[BIN_RDX1]], [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
; CHECK-NEXT: [[BIN_RDX3:%.*]] = fmul float [[BIN_RDX2]], [[TMP3]]
; CHECK-NEXT: ret float [[BIN_RDX3]]
;
entry:
%r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec)
@@ -180,8 +201,15 @@ entry:
define float @fmul_f32_strict_accum(float %accum, <4 x float> %vec) {
; CHECK-LABEL: @fmul_f32_strict_accum(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[R:%.*]] = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float [[ACCUM:%.*]], <4 x float> [[VEC:%.*]])
; CHECK-NEXT: ret float [[R]]
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
; CHECK-NEXT: [[BIN_RDX:%.*]] = fmul float [[ACCUM:%.*]], [[TMP0]]
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
; CHECK-NEXT: [[BIN_RDX1:%.*]] = fmul float [[BIN_RDX]], [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul float [[BIN_RDX1]], [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
; CHECK-NEXT: [[BIN_RDX3:%.*]] = fmul float [[BIN_RDX2]], [[TMP3]]
; CHECK-NEXT: ret float [[BIN_RDX3]]
;
entry:
%r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec)
