From 23c2182c2bfcd145b2c33cf5ca831a7348e6adce Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 9 Apr 2018 15:44:20 +0000
Subject: [PATCH] Support generic expansion of ordered vector reduction
 (PR36732)

Without fast-math flags, the llvm.experimental.vector.reduce.fadd/fmul
intrinsics must be expanded in order.

This patch scalarizes the reduction, applying the accumulator at the start
of the sequence: ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ...) + Scl[NumElts-1]

Differential Revision: https://reviews.llvm.org/D45366

llvm-svn: 329585
---
 .../include/llvm/Transforms/Utils/LoopUtils.h |  7 +++
 llvm/lib/CodeGen/ExpandReductions.cpp         | 15 ++++---
 llvm/lib/Transforms/Utils/LoopUtils.cpp       | 32 ++++++++++++++
 .../Generic/expand-experimental-reductions.ll | 44 +++++++++++++++----
 4 files changed, 84 insertions(+), 14 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 6e854ed24a7eb..3a0e804e0cce2 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -509,6 +509,13 @@ bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
                         LoopSafetyInfo *SafetyInfo,
                         OptimizationRemarkEmitter *ORE = nullptr);
 
+/// Generates an ordered vector reduction using extracts to reduce the value.
+Value *
+getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src, unsigned Op,
+                    RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
+                        RecurrenceDescriptor::MRK_Invalid,
+                    ArrayRef<Value *> RedOps = None);
+
 /// Generates a vector reduction using shufflevectors to reduce the value.
 Value *getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
                            RecurrenceDescriptor::MinMaxRecurrenceKind
diff --git a/llvm/lib/CodeGen/ExpandReductions.cpp b/llvm/lib/CodeGen/ExpandReductions.cpp
index abf487a4f198c..7552ba8cd85db 100644
--- a/llvm/lib/CodeGen/ExpandReductions.cpp
+++ b/llvm/lib/CodeGen/ExpandReductions.cpp
@@ -78,13 +78,15 @@ RecurrenceDescriptor::MinMaxRecurrenceKind getMRK(Intrinsic::ID ID) {
 
 bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
   bool Changed = false;
-  SmallVector<IntrinsicInst*, 4> Worklist;
+  SmallVector<IntrinsicInst *, 4> Worklist;
   for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
     if (auto II = dyn_cast<IntrinsicInst>(&*I))
       Worklist.push_back(II);
 
   for (auto *II : Worklist) {
     IRBuilder<> Builder(II);
+    bool IsOrdered = false;
+    Value *Acc = nullptr;
     Value *Vec = nullptr;
     auto ID = II->getIntrinsicID();
     auto MRK = RecurrenceDescriptor::MRK_Invalid;
@@ -92,11 +94,10 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
     case Intrinsic::experimental_vector_reduce_fadd:
     case Intrinsic::experimental_vector_reduce_fmul:
       // FMFs must be attached to the call, otherwise it's an ordered reduction
-      // and it can't be handled by generating this shuffle sequence.
-      // TODO: Implement scalarization of ordered reductions here for targets
-      // without native support.
+      // and it can't be handled by generating a shuffle sequence.
       if (!II->getFastMathFlags().isFast())
-        continue;
+        IsOrdered = true;
+      Acc = II->getArgOperand(0);
       Vec = II->getArgOperand(1);
       break;
     case Intrinsic::experimental_vector_reduce_add:
@@ -118,7 +119,9 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
     }
     if (!TTI->shouldExpandReduction(II))
      continue;
-    auto Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
+    Value *Rdx =
+        IsOrdered ? getOrderedReduction(Builder, Acc, Vec, getOpcode(ID), MRK)
+                  : getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
     II->replaceAllUsesWith(Rdx);
     II->eraseFromParent();
     Changed = true;
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 67e209583b749..805a003f18f8a 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1526,6 +1526,38 @@ static Value *addFastMathFlag(Value *V) {
   return V;
 }
 
+// Helper to generate an ordered reduction.
+Value *
+llvm::getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src,
+                          unsigned Op,
+                          RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
+                          ArrayRef<Value *> RedOps) {
+  unsigned VF = Src->getType()->getVectorNumElements();
+
+  // Extract and apply reduction ops in ascending order:
+  // e.g. ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ...) + Scl[VF-1]
+  Value *Result = Acc;
+  for (unsigned ExtractIdx = 0; ExtractIdx != VF; ++ExtractIdx) {
+    Value *Ext =
+        Builder.CreateExtractElement(Src, Builder.getInt32(ExtractIdx));
+
+    if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+      Result = Builder.CreateBinOp((Instruction::BinaryOps)Op, Result, Ext,
+                                   "bin.rdx");
+    } else {
+      assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
+             "Invalid min/max");
+      Result = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
+                                                    Result, Ext);
+    }
+
+    if (!RedOps.empty())
+      propagateIRFlags(Result, RedOps);
+  }
+
+  return Result;
+}
+
 // Helper to generate a log2 shuffle reduction.
 Value *
 llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
diff --git a/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll b/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll
index 472e66ce1ddf2..05fa6e324ac00 100644
--- a/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll
+++ b/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll
@@ -117,8 +117,15 @@ entry:
 define float @fadd_f32_strict(<4 x float> %vec) {
 ; CHECK-LABEL: @fadd_f32_strict(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[R:%.*]] = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> [[VEC:%.*]])
-; CHECK-NEXT:    ret float [[R]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd float undef, [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
+; CHECK-NEXT:    [[BIN_RDX1:%.*]] = fadd float [[BIN_RDX]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd float [[BIN_RDX1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fadd float [[BIN_RDX2]], [[TMP3]]
+; CHECK-NEXT:    ret float [[BIN_RDX3]]
 ;
 entry:
   %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)
@@ -128,8 +135,15 @@ entry:
 define float @fadd_f32_strict_accum(float %accum, <4 x float> %vec) {
 ; CHECK-LABEL: @fadd_f32_strict_accum(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[R:%.*]] = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float [[ACCUM:%.*]], <4 x float> [[VEC:%.*]])
-; CHECK-NEXT:    ret float [[R]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd float [[ACCUM:%.*]], [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
+; CHECK-NEXT:    [[BIN_RDX1:%.*]] = fadd float [[BIN_RDX]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd float [[BIN_RDX1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fadd float [[BIN_RDX2]], [[TMP3]]
+; CHECK-NEXT:    ret float [[BIN_RDX3]]
 ;
 entry:
   %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec)
@@ -169,8 +183,15 @@ entry:
 define float @fmul_f32_strict(<4 x float> %vec) {
 ; CHECK-LABEL: @fmul_f32_strict(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[R:%.*]] = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> [[VEC:%.*]])
-; CHECK-NEXT:    ret float [[R]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fmul float undef, [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
+; CHECK-NEXT:    [[BIN_RDX1:%.*]] = fmul float [[BIN_RDX]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fmul float [[BIN_RDX1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fmul float [[BIN_RDX2]], [[TMP3]]
+; CHECK-NEXT:    ret float [[BIN_RDX3]]
 ;
 entry:
   %r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec)
@@ -180,8 +201,15 @@ entry:
 define float @fmul_f32_strict_accum(float %accum, <4 x float> %vec) {
 ; CHECK-LABEL: @fmul_f32_strict_accum(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[R:%.*]] = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float [[ACCUM:%.*]], <4 x float> [[VEC:%.*]])
-; CHECK-NEXT:    ret float [[R]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fmul float [[ACCUM:%.*]], [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
+; CHECK-NEXT:    [[BIN_RDX1:%.*]] = fmul float [[BIN_RDX]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fmul float [[BIN_RDX1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fmul float [[BIN_RDX2]], [[TMP3]]
+; CHECK-NEXT:    ret float [[BIN_RDX3]]
 ;
 entry:
   %r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec)
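
Background note: the reason the strict (no fast-math) form cannot reuse the
log2 shuffle expansion is that floating-point addition is not associative, so
reassociating the sum changes the rounding. The following standalone C++
sketch is illustrative only and not part of the patch; the input values and
the pairwise grouping are assumptions chosen to expose the difference, not
the exact sequence getShuffleReduction emits.

  // Why ordered FP reductions must not be reassociated. The sequential
  // loop mirrors the expansion added by this patch:
  //   ((((Acc + V[0]) + V[1]) + V[2]) + V[3])
  // The pairwise grouping stands in for a shuffle-style reassociation.
  #include <cstdio>

  int main() {
    const float Acc = 1.0f;
    const float V[4] = {1e8f, 1.0f, 1.0f, -1e8f};

    // Ordered (strict) reduction: accumulator applied first, then the
    // elements in ascending lane order. Each 1.0f is absorbed by rounding
    // next to 1e8f, so the result is 0.
    float Ordered = Acc;
    for (float Elt : V)
      Ordered += Elt;

    // One possible reassociated order: adjacent lanes combined first, the
    // accumulator applied at the end. The large terms cancel exactly, so
    // the result is 1.
    float Reassoc = ((V[0] + V[1]) + (V[2] + V[3])) + Acc;

    // On IEEE-754 single-precision hardware (no fast-math), this prints
    // "ordered = 0, reassociated = 1".
    std::printf("ordered = %g, reassociated = %g\n", Ordered, Reassoc);
  }

This is also why the expansion applies the accumulator at the start of the
sequence rather than folding it in at the end: under strict semantics, Acc
must participate in the first rounding step, exactly as the scalarized
((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ...) + Scl[NumElts-1] chain does.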