diff --git a/polly/docs/ReleaseNotes.rst b/polly/docs/ReleaseNotes.rst index 1dbbc11f62cb6..3d550e21d690a 100644 --- a/polly/docs/ReleaseNotes.rst +++ b/polly/docs/ReleaseNotes.rst @@ -11,3 +11,15 @@ In Polly 14 the following important changes have been incorporated. branch. - Change ... + + * The command line option -polly-opt-fusion has been removed. What the + flag does was frequently misunderstood and is rarely useful. However, + the functionality is still accessible using +``` + -polly-isl-arg=--no-schedule-serialize-sccs +``` + + * The command line option -polly-loopfusion-greedy has been added. + This will agressively try to fuse any loop regardless of + profitability. The is what users might have expected what + -polly-opt-fusion=max would do. diff --git a/polly/include/polly/ScheduleTreeTransform.h b/polly/include/polly/ScheduleTreeTransform.h index e8685313c83c4..993245cfce481 100644 --- a/polly/include/polly/ScheduleTreeTransform.h +++ b/polly/include/polly/ScheduleTreeTransform.h @@ -240,6 +240,14 @@ isl::schedule_node applyRegisterTiling(isl::schedule_node Node, llvm::ArrayRef TileSizes, int DefaultTileSize); +/// Apply greedy fusion. That is, fuse any loop that is possible to be fused +/// top-down. +/// +/// @param Sched Sched tree to fuse all the loops in. +/// @param Deps Validity constraints that must be preserved. +isl::schedule applyGreedyFusion(isl::schedule Sched, + const isl::union_map &Deps); + } // namespace polly #endif // POLLY_SCHEDULETREETRANSFORM_H diff --git a/polly/include/polly/Support/GICHelper.h b/polly/include/polly/Support/GICHelper.h index 5c6a3256c1c36..5a106d7ae0d87 100644 --- a/polly/include/polly/Support/GICHelper.h +++ b/polly/include/polly/Support/GICHelper.h @@ -231,6 +231,12 @@ ISL_DUMP_OBJECT(val) ISL_DUMP_OBJECT(val_list) //@} +/// Emit the equivaltent of the isl_*_dump output into a raw_ostream. +/// @{ +void dumpIslObj(const isl::schedule_node &Node, llvm::raw_ostream &OS); +void dumpIslObj(__isl_keep isl_schedule_node *node, llvm::raw_ostream &OS); +/// @} + inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, __isl_keep isl_union_map *Map) { OS << polly::stringFromIslObj(Map, "null"); diff --git a/polly/lib/Support/GICHelper.cpp b/polly/lib/Support/GICHelper.cpp index 409dbf4766dd9..359b170502310 100644 --- a/polly/lib/Support/GICHelper.cpp +++ b/polly/lib/Support/GICHelper.cpp @@ -238,4 +238,25 @@ void neverCalled() { polly::dumpIslObj(isl::val()); polly::dumpIslObj(isl::val_list()); } + +void polly::dumpIslObj(__isl_keep isl_schedule_node *node, raw_ostream &OS) { + if (!node) + return; + + isl_ctx *ctx = isl_schedule_node_get_ctx(node); + isl_printer *p = isl_printer_to_str(ctx); + p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK); + p = isl_printer_print_schedule_node(p, node); + + char *char_str = isl_printer_get_str(p); + OS << char_str; + + free(char_str); + isl_printer_free(p); +} + +void polly::dumpIslObj(const isl::schedule_node &Node, raw_ostream &OS) { + dumpIslObj(Node.get(), OS); +} + #endif diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp index 932f69ef7da6c..02d468577387b 100644 --- a/polly/lib/Transform/ScheduleOptimizer.cpp +++ b/polly/lib/Transform/ScheduleOptimizer.cpp @@ -97,6 +97,11 @@ static cl::opt cl::desc("Maximize the band depth (yes/no)"), cl::Hidden, cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory)); +static cl::opt + GreedyFusion("polly-loopfusion-greedy", + cl::desc("Aggressively try to fuse everything"), cl::Hidden, + cl::ZeroOrMore, cl::cat(PollyCategory)); + static cl::opt OuterCoincidence( "polly-opt-outer-coincidence", cl::desc("Try to construct schedules where the outer member of each band " @@ -835,6 +840,13 @@ static bool runIslScheduleOptimizer( if (Schedule.is_null()) return false; + if (GreedyFusion) { + isl::union_map Validity = D.getDependences( + Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW); + Schedule = applyGreedyFusion(Schedule, Validity); + assert(!Schedule.is_null()); + } + // Apply post-rescheduling optimizations (if enabled) and/or prevectorization. const OptimizerAdditionalInfoTy OAI = { TTI, const_cast(&D), diff --git a/polly/lib/Transform/ScheduleTreeTransform.cpp b/polly/lib/Transform/ScheduleTreeTransform.cpp index ce4e6ae9614db..08b377c707acd 100644 --- a/polly/lib/Transform/ScheduleTreeTransform.cpp +++ b/polly/lib/Transform/ScheduleTreeTransform.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "polly/ScheduleTreeTransform.h" +#include "polly/Support/GICHelper.h" #include "polly/Support/ISLTools.h" #include "polly/Support/ScopHelper.h" #include "llvm/ADT/ArrayRef.h" @@ -20,10 +21,103 @@ #include "llvm/IR/Metadata.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#define DEBUG_TYPE "polly-opt-isl" + using namespace polly; using namespace llvm; namespace { + +/// Copy the band member attributes (coincidence, loop type, isolate ast loop +/// type) from one band to another. +static isl::schedule_node_band +applyBandMemberAttributes(isl::schedule_node_band Target, int TargetIdx, + const isl::schedule_node_band &Source, + int SourceIdx) { + bool Coincident = Source.member_get_coincident(SourceIdx).release(); + Target = Target.member_set_coincident(TargetIdx, Coincident); + + isl_ast_loop_type LoopType = + isl_schedule_node_band_member_get_ast_loop_type(Source.get(), SourceIdx); + Target = isl::manage(isl_schedule_node_band_member_set_ast_loop_type( + Target.release(), TargetIdx, LoopType)) + .as(); + + isl_ast_loop_type IsolateType = + isl_schedule_node_band_member_get_isolate_ast_loop_type(Source.get(), + SourceIdx); + Target = isl::manage(isl_schedule_node_band_member_set_isolate_ast_loop_type( + Target.release(), TargetIdx, IsolateType)) + .as(); + + return Target; +} + +/// Create a new band by copying members from another @p Band. @p IncludeCb +/// decides which band indices are copied to the result. +template +static isl::schedule rebuildBand(isl::schedule_node_band OldBand, + isl::schedule Body, CbTy IncludeCb) { + int NumBandDims = OldBand.n_member().release(); + + bool ExcludeAny = false; + bool IncludeAny = false; + for (auto OldIdx : seq(0, NumBandDims)) { + if (IncludeCb(OldIdx)) + IncludeAny = true; + else + ExcludeAny = true; + } + + // Instead of creating a zero-member band, don't create a band at all. + if (!IncludeAny) + return Body; + + isl::multi_union_pw_aff PartialSched = OldBand.get_partial_schedule(); + isl::multi_union_pw_aff NewPartialSched; + if (ExcludeAny) { + // Select the included partial scatter functions. + isl::union_pw_aff_list List = PartialSched.list(); + int NewIdx = 0; + for (auto OldIdx : seq(0, NumBandDims)) { + if (IncludeCb(OldIdx)) + NewIdx += 1; + else + List = List.drop(NewIdx, 1); + } + isl::space ParamSpace = PartialSched.get_space().params(); + isl::space NewScatterSpace = ParamSpace.add_unnamed_tuple(NewIdx); + NewPartialSched = isl::multi_union_pw_aff(NewScatterSpace, List); + } else { + // Just reuse original scatter function of copying all of them. + NewPartialSched = PartialSched; + } + + // Create the new band node. + isl::schedule_node_band NewBand = + Body.insert_partial_schedule(NewPartialSched) + .get_root() + .child(0) + .as(); + + // If OldBand was permutable, so is the new one, even if some dimensions are + // missing. + bool IsPermutable = OldBand.permutable().release(); + NewBand = NewBand.set_permutable(IsPermutable); + + // Reapply member attributes. + int NewIdx = 0; + for (auto OldIdx : seq(0, NumBandDims)) { + if (!IncludeCb(OldIdx)) + continue; + NewBand = + applyBandMemberAttributes(std::move(NewBand), NewIdx, OldBand, OldIdx); + NewIdx += 1; + } + + return NewBand.get_schedule(); +} + /// Recursively visit all nodes of a schedule tree while allowing changes. /// /// The visit methods return an isl::schedule_node that is used to continue @@ -75,23 +169,9 @@ struct ScheduleTreeRewriter } isl::schedule visitBand(isl::schedule_node_band Band, Args... args) { - isl::multi_union_pw_aff PartialSched = - isl::manage(isl_schedule_node_band_get_partial_schedule(Band.get())); isl::schedule NewChild = getDerived().visit(Band.child(0), std::forward(args)...); - isl::schedule_node NewNode = - NewChild.insert_partial_schedule(PartialSched).get_root().child(0); - - // Reapply permutability and coincidence attributes. - NewNode = isl::manage(isl_schedule_node_band_set_permutable( - NewNode.release(), isl_schedule_node_band_get_permutable(Band.get()))); - unsigned BandDims = isl_schedule_node_band_n_member(Band.get()); - for (unsigned i = 0; i < BandDims; i += 1) - NewNode = isl::manage(isl_schedule_node_band_member_set_coincident( - NewNode.release(), i, - isl_schedule_node_band_member_get_coincident(Band.get(), i))); - - return NewNode.get_schedule(); + return rebuildBand(Band, NewChild, [](int) { return true; }); } isl::schedule visitSequence(isl::schedule_node_sequence Sequence, @@ -159,6 +239,10 @@ struct ScheduleTreeRewriter } }; +/// Rewrite the schedule tree without any changes. Useful to copy a subtree into +/// a new schedule, discarding everything but. +struct IdentityRewriter : public ScheduleTreeRewriter {}; + /// Rewrite a schedule tree to an equivalent one without extension nodes. /// /// Each visit method takes two additional arguments: @@ -265,11 +349,9 @@ struct ExtensionNodeRewriter NewNode = isl::manage(isl_schedule_node_band_set_permutable( NewNode.release(), isl_schedule_node_band_get_permutable(OldNode.get()))); - for (unsigned i = 0; i < BandDims; i += 1) { - NewNode = isl::manage(isl_schedule_node_band_member_set_coincident( - NewNode.release(), i, - isl_schedule_node_band_member_get_coincident(OldNode.get(), i))); - } + for (unsigned i = 0; i < BandDims; i += 1) + NewNode = applyBandMemberAttributes(NewNode.as(), + i, OldNode, i); return NewNode.get_schedule(); } @@ -504,6 +586,404 @@ static isl::set addExtentConstraints(isl::set Set, int VectorWidth) { ExtConstr = ExtConstr.set_coefficient_si(isl::dim::set, Dims - 1, -1); return Set.add_constraint(ExtConstr); } + +/// Collapse perfectly nested bands into a single band. +class BandCollapseRewriter : public ScheduleTreeRewriter { +private: + using BaseTy = ScheduleTreeRewriter; + BaseTy &getBase() { return *this; } + const BaseTy &getBase() const { return *this; } + +public: + isl::schedule visitBand(isl::schedule_node_band RootBand) { + isl::schedule_node_band Band = RootBand; + isl::ctx Ctx = Band.ctx(); + + // Do not merge permutable band to avoid loosing the permutability property. + // Cannot collapse even two permutable loops, they might be permutable + // individually, but not necassarily accross. + if (Band.n_member().release() > 1 && Band.permutable()) + return getBase().visitBand(Band); + + // Find collapsable bands. + SmallVector Nest; + int NumTotalLoops = 0; + isl::schedule_node Body; + while (true) { + Nest.push_back(Band); + NumTotalLoops += Band.n_member().release(); + Body = Band.first_child(); + if (!Body.isa()) + break; + Band = Body.as(); + + // Do not include next band if it is permutable to not lose its + // permutability property. + if (Band.n_member().release() > 1 && Band.permutable()) + break; + } + + // Nothing to collapse, preserve permutability. + if (Nest.size() <= 1) + return getBase().visitBand(Band); + + LLVM_DEBUG({ + dbgs() << "Found loops to collapse between\n"; + dumpIslObj(RootBand, dbgs()); + dbgs() << "and\n"; + dumpIslObj(Body, dbgs()); + dbgs() << "\n"; + }); + + isl::schedule NewBody = visit(Body); + + // Collect partial schedules from all members. + isl::union_pw_aff_list PartScheds{Ctx, NumTotalLoops}; + for (isl::schedule_node_band Band : Nest) { + int NumLoops = Band.n_member().release(); + isl::multi_union_pw_aff BandScheds = Band.get_partial_schedule(); + for (auto j : seq(0, NumLoops)) + PartScheds = PartScheds.add(BandScheds.at(j)); + } + isl::space ScatterSpace = isl::space(Ctx, 0, NumTotalLoops); + isl::multi_union_pw_aff PartSchedsMulti{ScatterSpace, PartScheds}; + + isl::schedule_node_band CollapsedBand = + NewBody.insert_partial_schedule(PartSchedsMulti) + .get_root() + .first_child() + .as(); + + // Copy over loop attributes form original bands. + int LoopIdx = 0; + for (isl::schedule_node_band Band : Nest) { + int NumLoops = Band.n_member().release(); + for (int i : seq(0, NumLoops)) { + CollapsedBand = applyBandMemberAttributes(std::move(CollapsedBand), + LoopIdx, Band, i); + LoopIdx += 1; + } + } + assert(LoopIdx == NumTotalLoops && + "Expect the same number of loops to add up again"); + + return CollapsedBand.get_schedule(); + } +}; + +static isl::schedule collapseBands(isl::schedule Sched) { + LLVM_DEBUG(dbgs() << "Collapse bands in schedule\n"); + BandCollapseRewriter Rewriter; + return Rewriter.visit(Sched); +} + +/// Collect sequentially executed bands (or anything else), even if nested in a +/// mark or other nodes whose child is executed just once. If we can +/// successfully fuse the bands, we allow them to be removed. +static void collectPotentiallyFusableBands( + isl::schedule_node Node, + SmallVectorImpl> + &ScheduleBands, + const isl::schedule_node &DirectChild) { + switch (isl_schedule_node_get_type(Node.get())) { + case isl_schedule_node_sequence: + case isl_schedule_node_set: + case isl_schedule_node_mark: + case isl_schedule_node_domain: + case isl_schedule_node_filter: + if (Node.has_children()) { + isl::schedule_node C = Node.first_child(); + while (true) { + collectPotentiallyFusableBands(C, ScheduleBands, DirectChild); + if (!C.has_next_sibling()) + break; + C = C.next_sibling(); + } + } + break; + + default: + // Something that does not execute suquentially (e.g. a band) + ScheduleBands.push_back({Node, DirectChild}); + break; + } +} + +/// Remove dependencies that are resolved by @p PartSched. That is, remove +/// everything that we already know is executed in-order. +static isl::union_map remainingDepsFromPartialSchedule(isl::union_map PartSched, + isl::union_map Deps) { + int NumDims = getNumScatterDims(PartSched); + auto ParamSpace = PartSched.get_space().params(); + + // { Scatter[] } + isl::space ScatterSpace = + ParamSpace.set_from_params().add_dims(isl::dim::set, NumDims); + + // { Scatter[] -> Domain[] } + isl::union_map PartSchedRev = PartSched.reverse(); + + // { Scatter[] -> Scatter[] } + isl::map MaybeBefore = isl::map::lex_le(ScatterSpace); + + // { Domain[] -> Domain[] } + isl::union_map DomMaybeBefore = + MaybeBefore.apply_domain(PartSchedRev).apply_range(PartSchedRev); + + // { Domain[] -> Domain[] } + isl::union_map ChildRemainingDeps = Deps.intersect(DomMaybeBefore); + + return ChildRemainingDeps; +} + +/// Remove dependencies that are resolved by executing them in the order +/// specified by @p Domains; +static isl::union_map remainigDepsFromSequence(ArrayRef Domains, + isl::union_map Deps) { + isl::ctx Ctx = Deps.ctx(); + isl::space ParamSpace = Deps.get_space().params(); + + // Create a partial schedule mapping to constants that reflect the execution + // order. + isl::union_map PartialSchedules = isl::union_map::empty(Ctx); + for (auto P : enumerate(Domains)) { + isl::val ExecTime = isl::val(Ctx, P.index()); + isl::union_pw_aff DomSched{P.value(), ExecTime}; + PartialSchedules = PartialSchedules.unite(DomSched.as_union_map()); + } + + return remainingDepsFromPartialSchedule(PartialSchedules, Deps); +} + +/// Determine whether the outermost loop of to bands can be fused while +/// respecting validity dependencies. +static bool canFuseOutermost(const isl::schedule_node_band &LHS, + const isl::schedule_node_band &RHS, + const isl::union_map &Deps) { + // { LHSDomain[] -> Scatter[] } + isl::union_map LHSPartSched = + LHS.get_partial_schedule().get_at(0).as_union_map(); + + // { Domain[] -> Scatter[] } + isl::union_map RHSPartSched = + RHS.get_partial_schedule().get_at(0).as_union_map(); + + // Dependencies that are already resolved because LHS executes before RHS, but + // will not be anymore after fusion. { DefDomain[] -> UseDomain[] } + isl::union_map OrderedBySequence = + Deps.intersect_domain(LHSPartSched.domain()) + .intersect_range(RHSPartSched.domain()); + + isl::space ParamSpace = OrderedBySequence.get_space().params(); + isl::space NewScatterSpace = ParamSpace.add_unnamed_tuple(1); + + // { Scatter[] -> Scatter[] } + isl::map After = isl::map::lex_gt(NewScatterSpace); + + // After fusion, instances with smaller (or equal, which means they will be + // executed in the same iteration, but the LHS instance is still sequenced + // before RHS) scatter value will still be executed before. This are the + // orderings where this is not necessarily the case. + // { LHSDomain[] -> RHSDomain[] } + isl::union_map MightBeAfterDoms = After.apply_domain(LHSPartSched.reverse()) + .apply_range(RHSPartSched.reverse()); + + // Dependencies that are not resolved by the new execution order. + isl::union_map WithBefore = OrderedBySequence.intersect(MightBeAfterDoms); + + return WithBefore.is_empty(); +} + +/// Fuse @p LHS and @p RHS if possible while preserving validity dependenvies. +static isl::schedule tryGreedyFuse(isl::schedule_node_band LHS, + isl::schedule_node_band RHS, + const isl::union_map &Deps) { + if (!canFuseOutermost(LHS, RHS, Deps)) + return {}; + + LLVM_DEBUG({ + dbgs() << "Found loops for greedy fusion:\n"; + dumpIslObj(LHS, dbgs()); + dbgs() << "and\n"; + dumpIslObj(RHS, dbgs()); + dbgs() << "\n"; + }); + + // The partial schedule of the bands outermost loop that we need to combine + // for the fusion. + isl::union_pw_aff LHSPartOuterSched = LHS.get_partial_schedule().get_at(0); + isl::union_pw_aff RHSPartOuterSched = RHS.get_partial_schedule().get_at(0); + + // Isolate band bodies as roots of their own schedule trees. + IdentityRewriter Rewriter; + isl::schedule LHSBody = Rewriter.visit(LHS.first_child()); + isl::schedule RHSBody = Rewriter.visit(RHS.first_child()); + + // Reconstruct the non-outermost (not going to be fused) loops from both + // bands. + // TODO: Maybe it is possibly to transfer the 'permutability' property from + // LHS+RHS. At minimum we need merge multiple band members at once, otherwise + // permutability has no meaning. + isl::schedule LHSNewBody = + rebuildBand(LHS, LHSBody, [](int i) { return i > 0; }); + isl::schedule RHSNewBody = + rebuildBand(RHS, RHSBody, [](int i) { return i > 0; }); + + // The loop body of the fused loop. + isl::schedule NewCommonBody = LHSNewBody.sequence(RHSNewBody); + + // Combine the partial schedules of both loops to a new one. Instances with + // the same scatter value are put together. + isl::union_map NewCommonPartialSched = + LHSPartOuterSched.as_union_map().unite(RHSPartOuterSched.as_union_map()); + isl::schedule NewCommonSchedule = NewCommonBody.insert_partial_schedule( + NewCommonPartialSched.as_multi_union_pw_aff()); + + return NewCommonSchedule; +} + +static isl::schedule tryGreedyFuse(isl::schedule_node LHS, + isl::schedule_node RHS, + const isl::union_map &Deps) { + // TODO: Non-bands could be interpreted as a band with just as single + // iteration. However, this is only useful if both ends of a fused loop were + // originally loops themselves. + if (!LHS.isa()) + return {}; + if (!RHS.isa()) + return {}; + + return tryGreedyFuse(LHS.as(), + RHS.as(), Deps); +} + +/// Fuse all fusable loop top-down in a schedule tree. +/// +/// The isl::union_map parameters is the set of validity dependencies that have +/// not been resolved/carried by a parent schedule node. +class GreedyFusionRewriter + : public ScheduleTreeRewriter { +private: + using BaseTy = ScheduleTreeRewriter; + BaseTy &getBase() { return *this; } + const BaseTy &getBase() const { return *this; } + +public: + /// Is set to true if anything has been fused. + bool AnyChange = false; + + isl::schedule visitBand(isl::schedule_node_band Band, isl::union_map Deps) { + int NumLoops = Band.n_member().release(); + + // { Domain[] -> Scatter[] } + isl::union_map PartSched = + isl::union_map::from(Band.get_partial_schedule()); + assert(getNumScatterDims(PartSched) == NumLoops); + isl::space ParamSpace = PartSched.get_space().params(); + + // { Scatter[] -> Domain[] } + isl::union_map PartSchedRev = PartSched.reverse(); + + // Possible within the same iteration. Dependencies with smaller scatter + // value are carried by this loop and therefore have been resolved by the + // in-order execution if the loop iteration. A dependency with small scatter + // value would be a dependency violation that we assume did not happen. { + // Domain[] -> Domain[] } + isl::union_map Unsequenced = PartSchedRev.apply_domain(PartSchedRev); + + // Actual dependencies within the same iteration. + // { DefDomain[] -> UseDomain[] } + isl::union_map RemDeps = Deps.intersect(Unsequenced); + + return getBase().visitBand(Band, RemDeps); + } + + isl::schedule visitSequence(isl::schedule_node_sequence Sequence, + isl::union_map Deps) { + int NumChildren = isl_schedule_node_n_children(Sequence.get()); + + // List of fusion candidates. The first element is the fusion candidate, the + // second is candidate's ancestor that is the sequence's direct child. It is + // preferable to use the direct child if not if its non-direct children is + // fused to preserve its structure such as mark nodes. + SmallVector> Bands; + for (auto i : seq(0, NumChildren)) { + isl::schedule_node Child = Sequence.child(i); + collectPotentiallyFusableBands(Child, Bands, Child); + } + + // Direct children that had at least one of its decendants fused. + SmallDenseSet ChangedDirectChildren; + + // Fuse neigboring bands until reaching the end of candidates. + int i = 0; + while (i + 1 < (int)Bands.size()) { + isl::schedule Fused = + tryGreedyFuse(Bands[i].first, Bands[i + 1].first, Deps); + if (Fused.is_null()) { + // Cannot merge this node with the next; look at next pair. + i += 1; + continue; + } + + // Mark the direct children as (partially) fused. + if (!Bands[i].second.is_null()) + ChangedDirectChildren.insert(Bands[i].second.get()); + if (!Bands[i + 1].second.is_null()) + ChangedDirectChildren.insert(Bands[i + 1].second.get()); + + // Collapse the neigbros to a single new candidate that could be fused + // with the next candidate. + Bands[i] = {Fused.get_root(), {}}; + Bands.erase(Bands.begin() + i + 1); + + AnyChange = true; + } + + // By construction equal if done with collectPotentiallyFusableBands's + // output. + SmallVector SubDomains; + SubDomains.reserve(NumChildren); + for (int i = 0; i < NumChildren; i += 1) + SubDomains.push_back(Sequence.child(i).domain()); + auto SubRemainingDeps = remainigDepsFromSequence(SubDomains, Deps); + + // We may iterate over direct children multiple times, be sure to add each + // at most once. + SmallDenseSet AlreadyAdded; + + isl::schedule Result; + for (auto &P : Bands) { + isl::schedule_node MaybeFused = P.first; + isl::schedule_node DirectChild = P.second; + + // If not modified, use the direct child. + if (!DirectChild.is_null() && + !ChangedDirectChildren.count(DirectChild.get())) { + if (AlreadyAdded.count(DirectChild.get())) + continue; + AlreadyAdded.insert(DirectChild.get()); + MaybeFused = DirectChild; + } else { + assert(AnyChange && + "Need changed flag for be consistent with actual change"); + } + + // Top-down recursion: If the outermost loop has been fused, their nested + // bands might be fusable now as well. + isl::schedule InnerFused = visit(MaybeFused, SubRemainingDeps); + + // Reconstruct the sequence, with some of the children fused. + if (Result.is_null()) + Result = InnerFused; + else + Result = Result.sequence(InnerFused); + } + + return Result; + } +}; + } // namespace bool polly::isBandMark(const isl::schedule_node &Node) { @@ -774,3 +1254,19 @@ isl::schedule polly::applyMaxFission(isl::schedule_node BandToFission) { return Fissioned.get_schedule(); } + +isl::schedule polly::applyGreedyFusion(isl::schedule Sched, + const isl::union_map &Deps) { + LLVM_DEBUG(dbgs() << "Greedy loop fusion\n"); + + GreedyFusionRewriter Rewriter; + isl::schedule Result = Rewriter.visit(Sched, Deps); + if (!Rewriter.AnyChange) { + LLVM_DEBUG(dbgs() << "Found nothing to fuse\n"); + return Sched; + } + + // GreedyFusionRewriter due to working loop-by-loop, bands with multiple loops + // may have been split into multiple bands. + return collapseBands(Result); +} diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll new file mode 100644 index 0000000000000..807abe3d4fe17 --- /dev/null +++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll @@ -0,0 +1,78 @@ +; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s +; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s + +define void @func(i32 %n, [1024 x double]* noalias nonnull %A, [1024 x double]* noalias nonnull %B) { +entry: + br label %outer.for1 + +outer.for1: + %k1 = phi i32 [0, %entry], [%k1.inc, %outer.inc1] + %k1.cmp = icmp slt i32 %k1, %n + br i1 %k1.cmp, label %for1, label %outer.exit1 + + for1: + %j1 = phi i32 [0, %outer.for1], [%j1.inc, %inc1] + %j1.cmp = icmp slt i32 %j1, %n + br i1 %j1.cmp, label %body1, label %exit1 + + body1: + %arrayidx1 = getelementptr inbounds [1024 x double], [1024 x double]* %A, i32 %k1, i32 %j1 + store double 21.0, double* %arrayidx1 + br label %inc1 + + inc1: + %j1.inc = add nuw nsw i32 %j1, 1 + br label %for1 + + exit1: + br label %outer.inc1 + +outer.inc1: + %k1.inc = add nuw nsw i32 %k1, 1 + br label %outer.for1 + +outer.exit1: + br label %outer.for2 + +outer.for2: + %k2 = phi i32 [0, %outer.exit1], [%k2.inc, %outer.inc2] + %k2.cmp = icmp slt i32 %k2, %n + br i1 %k2.cmp, label %for2, label %outer.exit2 + + for2: + %j2 = phi i32 [0, %outer.for2], [%j2.inc, %inc2] + %j2.cmp = icmp slt i32 %j2, %n + br i1 %j2.cmp, label %body2, label %exit2 + + body2: + %arrayidx2 = getelementptr inbounds [1024 x double], [1024 x double]* %A, i32 %k2, i32 %j2 + store double 42.0, double* %arrayidx2 + br label %inc2 + + inc2: + %j2.inc = add nuw nsw i32 %j2, 1 + br label %for2 + + exit2: + br label %outer.inc2 + +outer.inc2: + %k2.inc = add nuw nsw i32 %k2, 1 + br label %outer.for2 + +outer.exit2: + br label %return + +return: + ret void +} + + +; CHECK: Calculated schedule: +; CHECK-NEXT: domain: "[n] -> { Stmt_body2[i0, i1] : 0 <= i0 < n and 0 <= i1 < n; Stmt_body1[i0, i1] : 0 <= i0 < n and 0 <= i1 < n }" +; CHECK-NEXT: child: +; CHECK-NEXT: schedule: "[n] -> [{ Stmt_body2[i0, i1] -> [(i0)]; Stmt_body1[i0, i1] -> [(i0)] }, { Stmt_body2[i0, i1] -> [(i1)]; Stmt_body1[i0, i1] -> [(i1)] }]" +; CHECK-NEXT: child: +; CHECK-NEXT: sequence: +; CHECK-NEXT: - filter: "[n] -> { Stmt_body1[i0, i1] }" +; CHECK-NEXT: - filter: "[n] -> { Stmt_body2[i0, i1] }" diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll new file mode 100644 index 0000000000000..601d38188cf67 --- /dev/null +++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll @@ -0,0 +1,90 @@ +; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK,RAW +; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK,OPT + +define void @func(i32 %n, double* noalias nonnull %A, double* noalias nonnull %B, i32 %k) { +entry: + br label %for1 + + +for1: + %j1 = phi i32 [0, %entry], [%j1.inc, %inc1] + %j1.cmp = icmp slt i32 %j1, %n + br i1 %j1.cmp, label %body1, label %exit1 + + body1: + %idx1 = add i32 %j1, %k + %arrayidx1 = getelementptr inbounds double, double* %B, i32 %idx1 + store double 21.0, double* %arrayidx1 + br label %inc1 + +inc1: + %j1.inc = add nuw nsw i32 %j1, 1 + br label %for1, !llvm.loop !1 + +exit1: + br label %for2 + + +for2: + %j2 = phi i32 [0, %exit1], [%j2.inc, %inc2] + %j2.cmp = icmp slt i32 %j2, %n + br i1 %j2.cmp, label %body2, label %exit2 + + body2: + %arrayidx2 = getelementptr inbounds double, double* %B, i32 %j2 + store double 42.0, double* %arrayidx2 + br label %inc2 + +inc2: + %j2.inc = add nuw nsw i32 %j2, 1 + br label %for2 + +exit2: + br label %for3 + + +for3: + %j3 = phi i32 [0, %exit2], [%j3.inc, %inc3] + %j3.cmp = icmp slt i32 %j3, %n + br i1 %j3.cmp, label %body3, label %exit3 + + body3: + %arrayidx3 = getelementptr inbounds double, double* %A, i32 %j3 + store double 84.0, double* %arrayidx3 + br label %inc3 + +inc3: + %j3.inc = add nuw nsw i32 %j3, 1 + br label %for3 + +exit3: + br label %return + + +return: + ret void +} + + +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.id", !"Hello World!"} + + +; CHECK: Calculated schedule: +; CHECK-NEXT: domain: "[n, k] -> { Stmt_body2[i0] : 0 <= i0 < n; Stmt_body1[i0] : 0 <= i0 < n; Stmt_body3[i0] : 0 <= i0 < n }" +; CHECK-NEXT: child: +; CHECK-NEXT: sequence: +; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body1[i0] }" +; CHECK-NEXT: child: +; RAW-NEXT: mark: "Loop with Metadata" +; RAW-NEXT: child: +; CHECK-NEXT: schedule: "[n, k] -> [{ Stmt_body1[i0] -> [(i0)] }]" +; OPT-NEXT: permutable: 1 +; OPT-NEXT: coincident: [ 1 ] +; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body2[i0]; Stmt_body3[i0] }" +; CHECK-NEXT: child: +; CHECK-NEXT: schedule: "[n, k] -> [{ Stmt_body2[i0] -> [(i0)]; Stmt_body3[i0] -> [(i0)] }]" +; CHECK-NEXT: child: +; CHECK-NEXT: sequence: +; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body2[i0] }" +; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body3[i0] }" diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll new file mode 100644 index 0000000000000..eeb20e610a2ee --- /dev/null +++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll @@ -0,0 +1,88 @@ +; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK,RAW +; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK + +define void @func(i32 %n, double* noalias nonnull %A, double* noalias nonnull %B, i32 %k) { +entry: + br label %for1 + + +for1: + %j1 = phi i32 [0, %entry], [%j1.inc, %inc1] + %j1.cmp = icmp slt i32 %j1, %n + br i1 %j1.cmp, label %body1, label %exit1 + + body1: + %arrayidx1 = getelementptr inbounds double, double* %A, i32 %j1 + store double 21.0, double* %arrayidx1 + br label %inc1 + +inc1: + %j1.inc = add nuw nsw i32 %j1, 1 + br label %for1 + +exit1: + br label %for2 + + +for2: + %j2 = phi i32 [0, %exit1], [%j2.inc, %inc2] + %j2.cmp = icmp slt i32 %j2, %n + br i1 %j2.cmp, label %body2, label %exit2 + + body2: + %arrayidx2 = getelementptr inbounds double, double* %B, i32 %j1 + store double 42.0, double* %arrayidx2 + br label %inc2 + +inc2: + %j2.inc = add nuw nsw i32 %j2, 1 + br label %for2 + +exit2: + br label %for3 + + +for3: + %j3 = phi i32 [0, %exit2], [%j3.inc, %inc3] + %j3.cmp = icmp slt i32 %j3, %n + br i1 %j3.cmp, label %body3, label %exit3 + + body3: + %idx3 = add i32 %j3, %k + %arrayidx3 = getelementptr inbounds double, double* %B, i32 %idx3 + store double 84.0, double* %arrayidx3 + br label %inc3 + +inc3: + %j3.inc = add nuw nsw i32 %j3, 1 + br label %for3, !llvm.loop !1 + +exit3: + br label %return + + +return: + ret void +} + + +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.id", !"Hello World!"} + + +; CHECK: Calculated schedule: +; CHECK-NEXT: domain: "[n, k] -> { Stmt_body2[i0] : 0 <= i0 < n; Stmt_body1[i0] : 0 <= i0 < n; Stmt_body3[i0] : 0 <= i0 < n }" +; CHECK-NEXT: child: +; CHECK-NEXT: sequence: +; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body2[i0]; Stmt_body1[i0] }" +; CHECK-NEXT: child: +; CHECK-NEXT: schedule: "[n, k] -> [{ Stmt_body2[i0] -> [(i0)]; Stmt_body1[i0] -> [(i0)] }]" +; CHECK-NEXT: child: +; CHECK-NEXT: sequence: +; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body1[i0] }" +; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body2[i0] }" +; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body3[i0] }" +; CHECK-NEXT: child: +; RAW-NEXT: mark: "Loop with Metadata" +; RAW-NEXT: child: +; CHECK-NEXT: schedule: "[n, k] -> [{ Stmt_body3[i0] -> [(i0)] }]" diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll new file mode 100644 index 0000000000000..387c77c860987 --- /dev/null +++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll @@ -0,0 +1,69 @@ +; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK,RAW +; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK,OPT + +define void @func(i32 %n, double* noalias nonnull %A) { +entry: + br label %outer.for + +outer.for: + %k = phi i32 [0, %entry], [%k.inc, %outer.inc] + %k.cmp = icmp slt i32 %k, %n + br i1 %k.cmp, label %for1, label %outer.exit + + for1: + %j1 = phi i32 [0, %outer.for], [%j1.inc, %inc1] + %j1.cmp = icmp slt i32 %j1, %n + br i1 %j1.cmp, label %body1, label %exit1 + + body1: + %arrayidx1 = getelementptr inbounds double, double* %A, i32 %j1 + store double 21.0, double* %arrayidx1 + br label %inc1 + + inc1: + %j1.inc = add nuw nsw i32 %j1, 1 + br label %for1 + + exit1: + br label %for2 + + for2: + %j2 = phi i32 [0, %exit1], [%j2.inc, %inc2] + %j2.cmp = icmp slt i32 %j2, %n + br i1 %j2.cmp, label %body2, label %exit2 + + body2: + %arrayidx2 = getelementptr inbounds double, double* %A, i32 %j2 + store double 42.0, double* %arrayidx2 + br label %inc2 + + inc2: + %j2.inc = add nuw nsw i32 %j2, 1 + br label %for2 + + exit2: + br label %outer.inc + +outer.inc: + %k.inc = add nuw nsw i32 %k, 1 + br label %outer.for + +outer.exit: + br label %return + +return: + ret void +} + + +; CHECK: Calculated schedule: +; CHECK-NEXT: domain: "[n] -> { Stmt_body2[i0, i1] : 0 <= i0 < n and 0 <= i1 < n; Stmt_body1[i0, i1] : 0 <= i0 < n and 0 <= i1 < n }" +; CHECK-NEXT: child: +; RAW-NEXT: schedule: "[n] -> [{ Stmt_body2[i0, i1] -> [(i0)]; Stmt_body1[i0, i1] -> [(i0)] }, { Stmt_body2[i0, i1] -> [(i1)]; Stmt_body1[i0, i1] -> [(i1)] }]" +; OPT-NEXT: schedule: "[n] -> [{ Stmt_body2[i0, i1] -> [(i1)]; Stmt_body1[i0, i1] -> [(i1)] }, { Stmt_body2[i0, i1] -> [(i0)]; Stmt_body1[i0, i1] -> [(i0)] }]" +; OPT-NEXT: permutable: 1 +; OPT-NEXT: coincident: [ 1, 0 ] +; CHECK-NEXT: child: +; CHECK-NEXT: sequence: +; CHECK-NEXT: - filter: "[n] -> { Stmt_body1[i0, i1] }" +; CHECK-NEXT: - filter: "[n] -> { Stmt_body2[i0, i1] }" diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll new file mode 100644 index 0000000000000..00eb041f9e285 --- /dev/null +++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll @@ -0,0 +1,88 @@ +; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK,RAW +; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK + +define void @func(i32 %n, double* noalias nonnull %A, double* noalias nonnull %B, i32 %k) { +entry: + br label %for1 + + +for1: + %j1 = phi i32 [0, %entry], [%j1.inc, %inc1] + %j1.cmp = icmp slt i32 %j1, %n + br i1 %j1.cmp, label %body1, label %exit1 + + body1: + %arrayidx1 = getelementptr inbounds double, double* %A, i32 %j1 + store double 21.0, double* %arrayidx1 + br label %inc1 + +inc1: + %j1.inc = add nuw nsw i32 %j1, 1 + br label %for1 + +exit1: + br label %for2 + + +for2: + %j2 = phi i32 [0, %exit1], [%j2.inc, %inc2] + %j2.cmp = icmp slt i32 %j2, %n + br i1 %j2.cmp, label %body2, label %exit2 + + body2: + %arrayidx2 = getelementptr inbounds double, double* %B, i32 %j2 + store double 42.0, double* %arrayidx2 + br label %inc2 + +inc2: + %j2.inc = add nuw nsw i32 %j2, 1 + br label %for2 + +exit2: + br label %for3 + + +for3: + %j3 = phi i32 [0, %exit2], [%j3.inc, %inc3] + %j3.cmp = icmp slt i32 %j3, %n + br i1 %j3.cmp, label %body3, label %exit3 + + body3: + %idx3 = add i32 %j3, %k + %arrayidx3 = getelementptr inbounds double, double* %B, i32 %idx3 + store double 84.0, double* %arrayidx3 + br label %inc3 + +inc3: + %j3.inc = add nuw nsw i32 %j3, 1 + br label %for3, !llvm.loop !1 + +exit3: + br label %return + + +return: + ret void +} + + +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.id", !"Hello World!"} + + +; CHECK: Calculated schedule: +; CHECK-NEXT: domain: "[n, k] -> { Stmt_body2[i0] : 0 <= i0 < n; Stmt_body1[i0] : 0 <= i0 < n; Stmt_body3[i0] : 0 <= i0 < n }" +; CHECK-NEXT: child: +; CHECK-NEXT: sequence: +; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body2[i0]; Stmt_body1[i0] }" +; CHECK-NEXT: child: +; CHECK-NEXT: schedule: "[n, k] -> [{ Stmt_body2[i0] -> [(i0)]; Stmt_body1[i0] -> [(i0)] }]" +; CHECK-NEXT: child: +; CHECK-NEXT: sequence: +; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body1[i0] }" +; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body2[i0] }" +; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body3[i0] }" +; CHECK-NEXT: child: +; RAW-NEXT: mark: "Loop with Metadata" +; RAW-NEXT: child: +; CHECK-NEXT: schedule: "[n, k] -> [{ Stmt_body3[i0] -> [(i0)] }]" diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll new file mode 100644 index 0000000000000..ada3934dec4db --- /dev/null +++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll @@ -0,0 +1,66 @@ +; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s +; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s + +define void @func(i32 %n, [1024 x double]* noalias nonnull %A) { +entry: + br label %outer.for + +outer.for: + %k = phi i32 [0, %entry], [%k.inc, %outer.inc] + %k.cmp = icmp slt i32 %k, %n + br i1 %k.cmp, label %for1, label %outer.exit + + for1: + %j1 = phi i32 [0, %outer.for], [%j1.inc, %inc1] + %j1.cmp = icmp slt i32 %j1, %n + br i1 %j1.cmp, label %body1, label %exit1 + + body1: + %arrayidx1 = getelementptr inbounds [1024 x double], [1024 x double]* %A, i32 %k, i32 %j1 + store double 21.0, double* %arrayidx1 + br label %inc1 + + inc1: + %j1.inc = add nuw nsw i32 %j1, 1 + br label %for1 + + exit1: + br label %for2 + + for2: + %j2 = phi i32 [0, %exit1], [%j2.inc, %inc2] + %j2.cmp = icmp slt i32 %j2, %n + br i1 %j2.cmp, label %body2, label %exit2 + + body2: + %arrayidx2 = getelementptr inbounds [1024 x double], [1024 x double]* %A, i32 %k, i32 %j2 + store double 42.0, double* %arrayidx2 + br label %inc2 + + inc2: + %j2.inc = add nuw nsw i32 %j2, 1 + br label %for2 + + exit2: + br label %outer.inc + +outer.inc: + %k.inc = add nuw nsw i32 %k, 1 + br label %outer.for + +outer.exit: + br label %return + +return: + ret void +} + + +; CHECK: Calculated schedule: +; CHECK-NEXT: domain: "[n] -> { Stmt_body2[i0, i1] : 0 <= i0 < n and 0 <= i1 < n; Stmt_body1[i0, i1] : 0 <= i0 < n and 0 <= i1 < n }" +; CHECK-NEXT: child: +; CHECK-NEXT: schedule: "[n] -> [{ Stmt_body2[i0, i1] -> [(i0)]; Stmt_body1[i0, i1] -> [(i0)] }, { Stmt_body2[i0, i1] -> [(i1)]; Stmt_body1[i0, i1] -> [(i1)] }]" +; CHECK-NEXT: child: +; CHECK-NEXT: sequence: +; CHECK-NEXT: - filter: "[n] -> { Stmt_body1[i0, i1] }" +; CHECK-NEXT: - filter: "[n] -> { Stmt_body2[i0, i1] }" diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll new file mode 100644 index 0000000000000..ba083ac0b5fa5 --- /dev/null +++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll @@ -0,0 +1,54 @@ +; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s +; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s + +define void @func(i32 %n, double* noalias nonnull %A) { +entry: + br label %for1 + +for1: + %j1 = phi i32 [0, %entry], [%j1.inc, %inc1] + %j1.cmp = icmp slt i32 %j1, %n + br i1 %j1.cmp, label %body1, label %exit1 + + body1: + %arrayidx1 = getelementptr inbounds double, double* %A, i32 %j1 + store double 21.0, double* %arrayidx1 + br label %inc1 + +inc1: + %j1.inc = add nuw nsw i32 %j1, 1 + br label %for1 + +exit1: + br label %for2 + +for2: + %j2 = phi i32 [0, %exit1], [%j2.inc, %inc2] + %j2.cmp = icmp slt i32 %j2, %n + br i1 %j2.cmp, label %body2, label %exit2 + + body2: + %arrayidx2 = getelementptr inbounds double, double* %A, i32 %j2 + store double 42.0, double* %arrayidx2 + br label %inc2 + +inc2: + %j2.inc = add nuw nsw i32 %j2, 1 + br label %for2 + +exit2: + br label %return + +return: + ret void +} + + +; CHECK: Calculated schedule: +; CHECK-NEXT: domain: "[n] -> { Stmt_body2[i0] : 0 <= i0 < n; Stmt_body1[i0] : 0 <= i0 < n }" +; CHECK-NEXT: child: +; CHECK-NEXT: schedule: "[n] -> [{ Stmt_body2[i0] -> [(i0)]; Stmt_body1[i0] -> [(i0)] }]" +; CHECK-NEXT: child: +; CHECK-NEXT: sequence: +; CHECK-NEXT: - filter: "[n] -> { Stmt_body1[i0] }" +; CHECK-NEXT: - filter: "[n] -> { Stmt_body2[i0] }" diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll new file mode 100644 index 0000000000000..1509aa9959174 --- /dev/null +++ b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll @@ -0,0 +1,51 @@ +; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s +; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s + +; This could theoretically be fused by adjusting the offset of the second loop by %k (instead of relying on schedule dimensions). + +define void @func(i32 %n, double* noalias nonnull %A, i32 %k) { +entry: + br label %for1 + +for1: + %j1 = phi i32 [0, %entry], [%j1.inc, %inc1] + %j1.cmp = icmp slt i32 %j1, %n + br i1 %j1.cmp, label %body1, label %exit1 + + body1: + %arrayidx1 = getelementptr inbounds double, double* %A, i32 %j1 + store double 21.0, double* %arrayidx1 + br label %inc1 + +inc1: + %j1.inc = add nuw nsw i32 %j1, 1 + br label %for1 + +exit1: + br label %for2 + +for2: + %j2 = phi i32 [0, %exit1], [%j2.inc, %inc2] + %j2.cmp = icmp slt i32 %j2, %n + br i1 %j2.cmp, label %body2, label %exit2 + + body2: + %idx2 = add i32 %j2, %k + %arrayidx2 = getelementptr inbounds double, double* %A, i32 %idx2 + store double 42.0, double* %arrayidx2 + br label %inc2 + +inc2: + %j2.inc = add nuw nsw i32 %j2, 1 + br label %for2 + +exit2: + br label %return + +return: + ret void +} + + +; CHECK: Calculated schedule: +; CHECK-NEXT: n/a diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll new file mode 100644 index 0000000000000..e52b89beef0b9 --- /dev/null +++ b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll @@ -0,0 +1,57 @@ +; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s +; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s + +define void @func(i32 %n, double* noalias nonnull %A, double* noalias nonnull %B, i32 %k) { +entry: + br label %for1 + + +for1: + %j1 = phi i32 [0, %entry], [%j1.inc, %inc1] + %j1.cmp = icmp slt i32 %j1, %n + br i1 %j1.cmp, label %body1, label %exit1 + + body1: + %idx1 = add i32 %j1, %k + %arrayidx1 = getelementptr inbounds double, double* %A, i32 %j1 + store double 21.0, double* %arrayidx1 + br label %inc1 + +inc1: + %j1.inc = add nuw nsw i32 %j1, 1 + br label %for1 + +exit1: + br label %middle2 + + +middle2: + store double 52.0, double* %A + br label %for3 + + +for3: + %j3 = phi i32 [0, %middle2], [%j3.inc, %inc3] + %j3.cmp = icmp slt i32 %j3, %n + br i1 %j3.cmp, label %body3, label %exit3 + + body3: + %arrayidx3 = getelementptr inbounds double, double* %B, i32 %j3 + store double 84.0, double* %arrayidx3 + br label %inc3 + +inc3: + %j3.inc = add nuw nsw i32 %j3, 1 + br label %for3 + +exit3: + br label %return + + +return: + ret void +} + + +; CHECK: Calculated schedule: +; CHECK-NEXT: n/a