Skip to content

Commit

Permalink
[UnrollAndJam] New Unroll and Jam pass
Browse files Browse the repository at this point in the history
This is a simple implementation of the unroll-and-jam classical loop
optimisation.

The basic idea is that we take an outer loop of the form:

  for i..
    ForeBlocks(i)
    for j..
      SubLoopBlocks(i, j)
    AftBlocks(i)

Instead of doing normal inner or outer unrolling, we unroll as follows:

  for i... i+=2
    ForeBlocks(i)
    ForeBlocks(i+1)
    for j..
      SubLoopBlocks(i, j)
      SubLoopBlocks(i+1, j)
    AftBlocks(i)
    AftBlocks(i+1)
  Remainder Loop

So we have unrolled the outer loop, then jammed the two inner loops into
one. This can lead to a simpler inner loop if memory accesses can be shared
between the now jammed loops.

To do this we have to prove that this is all safe, both for the memory
accesses (using dependence analysis) and that ForeBlocks(i+1) can move before
AftBlocks(i) and SubLoopBlocks(i, j).

Differential Revision: https://reviews.llvm.org/D41953

llvm-svn: 336062
  • Loading branch information
davemgreen committed Jul 1, 2018
1 parent 8dabda7 commit 963401d
Show file tree
Hide file tree
Showing 23 changed files with 3,849 additions and 20 deletions.
3 changes: 3 additions & 0 deletions llvm/include/llvm-c/Transforms/Scalar.h
Expand Up @@ -89,6 +89,9 @@ void LLVMAddLoopRerollPass(LLVMPassManagerRef PM);
/** See llvm::createLoopUnrollPass function. */
void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM);

/** See llvm::createLoopUnrollAndJamPass function. */
void LLVMAddLoopUnrollAndJamPass(LLVMPassManagerRef PM);

/** See llvm::createLoopUnswitchPass function. */
void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM);

Expand Down
7 changes: 7 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Expand Up @@ -422,6 +422,13 @@ class TargetTransformInfo {
bool AllowPeeling;
/// Allow unrolling of all the iterations of the runtime loop remainder.
bool UnrollRemainder;
/// Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollAndJam;
/// Threshold for unroll and jam, for inner loop size. The 'Threshold'
/// value above is used during unroll and jam for the outer loop size.
/// This value is used in the same manner to limit the size of the inner
/// loop.
unsigned UnrollAndJamInnerLoopThreshold;
};

/// Get target-customized preferences for the generic loop unrolling
Expand Down
1 change: 1 addition & 0 deletions llvm/include/llvm/InitializePasses.h
Expand Up @@ -226,6 +226,7 @@ void initializeLoopSimplifyCFGLegacyPassPass(PassRegistry&);
void initializeLoopSimplifyPass(PassRegistry&);
void initializeLoopStrengthReducePass(PassRegistry&);
void initializeLoopUnrollPass(PassRegistry&);
void initializeLoopUnrollAndJamPass(PassRegistry&);
void initializeLoopUnswitchPass(PassRegistry&);
void initializeLoopVectorizePass(PassRegistry&);
void initializeLoopVersioningLICMPass(PassRegistry&);
Expand Down
1 change: 1 addition & 0 deletions llvm/include/llvm/LinkAllPasses.h
Expand Up @@ -132,6 +132,7 @@ namespace {
(void) llvm::createLoopStrengthReducePass();
(void) llvm::createLoopRerollPass();
(void) llvm::createLoopUnrollPass();
(void) llvm::createLoopUnrollAndJamPass();
(void) llvm::createLoopUnswitchPass();
(void) llvm::createLoopVersioningLICMPass();
(void) llvm::createLoopIdiomPass();
Expand Down
6 changes: 6 additions & 0 deletions llvm/include/llvm/Transforms/Scalar.h
Expand Up @@ -190,6 +190,12 @@ Pass *createLoopUnrollPass(int OptLevel = 2, int Threshold = -1, int Count = -1,
// Create an unrolling pass for full unrolling that uses exact trip count only.
Pass *createSimpleLoopUnrollPass(int OptLevel = 2);

//===----------------------------------------------------------------------===//
//
// LoopUnrollAndJam - This pass is a simple loop unroll and jam pass.
//
Pass *createLoopUnrollAndJamPass(int OptLevel = 2);

//===----------------------------------------------------------------------===//
//
// LoopReroll - This pass is a simple loop rerolling pass.
Expand Down
35 changes: 35 additions & 0 deletions llvm/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h
@@ -0,0 +1,35 @@
//===- LoopUnrollAndJamPass.h -----------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_TRANSFORMS_SCALAR_LOOPUNROLLANDJAMPASS_H
#define LLVM_TRANSFORMS_SCALAR_LOOPUNROLLANDJAMPASS_H

#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/PassManager.h"

namespace llvm {

class Loop;
struct LoopStandardAnalysisResults;
class LPMUpdater;

/// A loop pass implementing the unroll-and-jam transformation: the outer loop
/// of a loop nest is unrolled and the resulting copies of the inner loop are
/// fused ("jammed") into a single inner loop.
class LoopUnrollAndJamPass : public PassInfoMixin<LoopUnrollAndJamPass> {
// Optimization level supplied at construction; immutable afterwards.
const int OptLevel;

public:
/// Create the pass for the given optimization level (defaults to 2).
explicit LoopUnrollAndJamPass(int OptLevel = 2) : OptLevel(OptLevel) {}
/// Entry point invoked for loop \p L. Only declared here; the definition
/// lives outside this header.
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR, LPMUpdater &U);
};

} // end namespace llvm

#endif // LLVM_TRANSFORMS_SCALAR_LOOPUNROLLANDJAMPASS_H
41 changes: 41 additions & 0 deletions llvm/include/llvm/Transforms/Utils/UnrollLoop.h
Expand Up @@ -19,11 +19,13 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

namespace llvm {

class AssumptionCache;
class BasicBlock;
class DependenceInfo;
class DominatorTree;
class Loop;
class LoopInfo;
Expand Down Expand Up @@ -78,8 +80,47 @@ bool canPeel(Loop *L);
bool peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, ScalarEvolution *SE,
DominatorTree *DT, AssumptionCache *AC, bool PreserveLCSSA);

/// Unroll and jam loop \p L, reporting the outcome as a LoopUnrollResult.
/// NOTE(review): \p Count is presumably the outer-loop unroll factor, and
/// \p TripCount / \p TripMultiple the known trip-count information for the
/// outer loop; \p UnrollRemainder presumably controls unrolling of the
/// remainder loop -- confirm against the definition.
LoopUnrollResult UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
unsigned TripMultiple, bool UnrollRemainder,
LoopInfo *LI, ScalarEvolution *SE,
DominatorTree *DT, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE);

/// Query whether loop \p L may be unrolled and jammed.
/// NOTE(review): presumably verifies, using the dependence analysis \p DI,
/// that jamming does not break memory dependences and that the blocks before
/// the inner loop can be moved ahead of the inner-loop and trailing blocks of
/// the previous iteration -- confirm against the definition.
bool isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
DependenceInfo &DI);

bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
const SmallPtrSetImpl<const Value *> &EphValues,
OptimizationRemarkEmitter *ORE, unsigned &TripCount,
unsigned MaxTripCount, unsigned &TripMultiple,
unsigned LoopSize,
TargetTransformInfo::UnrollingPreferences &UP,
bool &UseUpperBound);

BasicBlock *foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI,
ScalarEvolution *SE, DominatorTree *DT);

void remapInstruction(Instruction *I, ValueToValueMapTy &VMap);

void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
ScalarEvolution *SE, DominatorTree *DT,
AssumptionCache *AC);

MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name);

TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling);

unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
bool &NotDuplicatable, bool &Convergent,
const TargetTransformInfo &TTI,
const SmallPtrSetImpl<const Value *> &EphValues,
unsigned BEInsns);

} // end namespace llvm

#endif // LLVM_TRANSFORMS_UTILS_UNROLLLOOP_H
10 changes: 10 additions & 0 deletions llvm/lib/Passes/PassBuilder.cpp
Expand Up @@ -121,6 +121,7 @@
#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
#include "llvm/Transforms/Scalar/LoopSink.h"
#include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
#include "llvm/Transforms/Scalar/LowerAtomic.h"
#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
Expand Down Expand Up @@ -179,6 +180,10 @@ static cl::opt<bool> EnableGVNSink(
"enable-npm-gvn-sink", cl::init(false), cl::Hidden,
cl::desc("Enable the GVN hoisting pass for the new PM (default = off)"));

// Hidden flag, off by default. When set, buildModuleOptimizationPipeline adds
// LoopUnrollAndJamPass (in its own loop pass adaptor) ahead of LoopUnrollPass.
static cl::opt<bool> EnableUnrollAndJam(
"enable-npm-unroll-and-jam", cl::init(false), cl::Hidden,
cl::desc("Enable the Unroll and Jam pass for the new PM (default = off)"));

static cl::opt<bool> EnableSyntheticCounts(
"enable-npm-synthetic-counts", cl::init(false), cl::Hidden, cl::ZeroOrMore,
cl::desc("Run synthetic function entry count generation "
Expand Down Expand Up @@ -798,6 +803,11 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
// FIXME: It would be really good to use a loop-integrated instruction
// combiner for cleanup here so that the unrolling and LICM can be pipelined
// across the loop nests.
// We do UnrollAndJam in a separate LPM to ensure it happens before the unroll
// pass.
if (EnableUnrollAndJam) {
OptimizePM.addPass(
createFunctionToLoopPassAdaptor(LoopUnrollAndJamPass(Level)));
}
OptimizePM.addPass(LoopUnrollPass(Level));
OptimizePM.addPass(InstCombinePass());
OptimizePM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Passes/PassRegistry.def
Expand Up @@ -241,6 +241,7 @@ LOOP_PASS("simplify-cfg", LoopSimplifyCFGPass())
LOOP_PASS("strength-reduce", LoopStrengthReducePass())
LOOP_PASS("indvars", IndVarSimplifyPass())
LOOP_PASS("irce", IRCEPass())
LOOP_PASS("unroll-and-jam", LoopUnrollAndJamPass())
LOOP_PASS("unroll-full", LoopFullUnrollPass())
LOOP_PASS("unswitch", SimpleLoopUnswitchPass())
LOOP_PASS("print-access-info", LoopAccessInfoPrinterPass(dbgs()))
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
Expand Up @@ -622,6 +622,8 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
UP.Runtime = true;
UP.UnrollRemainder = true;
UP.DefaultUnrollRuntimeCount = 4;
UP.UnrollAndJam = true;
UP.UnrollAndJamInnerLoopThreshold = 60;

// Force unrolling small loops can be very useful because of the branch
// taken cost of the backedge.
Expand Down
11 changes: 11 additions & 0 deletions llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
Expand Up @@ -96,6 +96,10 @@ static cl::opt<bool> EnableLoopInterchange(
"enable-loopinterchange", cl::init(false), cl::Hidden,
cl::desc("Enable the new, experimental LoopInterchange Pass"));

// Hidden flag, off by default. When set (and loop unrolling is not disabled),
// populateModulePassManager adds the unroll-and-jam pass immediately before
// the regular loop unroll pass.
static cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam",
cl::init(false), cl::Hidden,
cl::desc("Enable Unroll And Jam Pass"));

static cl::opt<bool>
EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden,
cl::desc("Enable preparation for ThinLTO."));
Expand Down Expand Up @@ -669,6 +673,13 @@ void PassManagerBuilder::populateModulePassManager(
addInstructionCombiningPass(MPM);

if (!DisableUnrollLoops) {
if (EnableUnrollAndJam) {
// Unroll and Jam. We do this before unroll but need to be in a separate
// loop pass manager in order for the outer loop to be processed by
// unroll and jam before the inner loop is unrolled.
MPM.add(createLoopUnrollAndJamPass(OptLevel));
}

MPM.add(createLoopUnrollPass(OptLevel)); // Unroll small loops

// LoopUnroll may generate some redundancy to clean up.
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Scalar/CMakeLists.txt
Expand Up @@ -39,6 +39,7 @@ add_llvm_library(LLVMScalarOpts
LoopSimplifyCFG.cpp
LoopStrengthReduce.cpp
LoopUnrollPass.cpp
LoopUnrollAndJamPass.cpp
LoopUnswitch.cpp
LoopVersioningLICM.cpp
LowerAtomic.cpp
Expand Down

0 comments on commit 963401d

Please sign in to comment.