Skip to content

Commit

Permalink
[DSE] Remove stores in the same loop iteration
Browse files Browse the repository at this point in the history
DSE will currently only remove stores in the same block unless they can
be guaranteed to be loop invariant. This expands that to any stores that
are in the same Loop, at the same loop level. This should still account
for where AA/MSSA will not handle aliasing between loops, but allow the
dead stores to be removed where they overlap in the same loop iteration.
It requires adding loop info to DSE, but that looks fairly harmless.

The test case this helps is from code like this, which can come up in
certain matrix operations:
  for(i=..)
    dst[i] = 0;
    for(j=..)
      dst[i] += src[i*n+j];

After LICM, this becomes:
for(i=..)
  dst[i] = 0;
  sum = 0;
  for(j=..)
    sum += src[i*n+j];
  dst[i] = sum;

The first store is dead, and with this patch is now removed.

Differntial Revision: https://reviews.llvm.org/D100464
  • Loading branch information
davemgreen committed Jun 20, 2021
1 parent 4c44b02 commit a24b021
Show file tree
Hide file tree
Showing 7 changed files with 85 additions and 32 deletions.
102 changes: 79 additions & 23 deletions llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
Expand Up @@ -40,10 +40,12 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
Expand Down Expand Up @@ -853,6 +855,11 @@ struct DSEState {
PostDominatorTree &PDT;
const TargetLibraryInfo &TLI;
const DataLayout &DL;
const LoopInfo &LI;

// Whether the function contains any irreducible control flow, useful for
// being accurately able to detect loops.
bool ContainsIrreducibleLoops;

// All MemoryDefs that potentially could kill other MemDefs.
SmallVector<MemoryDef *, 64> MemDefs;
Expand All @@ -876,14 +883,15 @@ struct DSEState {
DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs;

DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
PostDominatorTree &PDT, const TargetLibraryInfo &TLI)
PostDominatorTree &PDT, const TargetLibraryInfo &TLI,
const LoopInfo &LI)
: F(F), AA(AA), BatchAA(AA), MSSA(MSSA), DT(DT), PDT(PDT), TLI(TLI),
DL(F.getParent()->getDataLayout()) {}
DL(F.getParent()->getDataLayout()), LI(LI) {}

static DSEState get(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
DominatorTree &DT, PostDominatorTree &PDT,
const TargetLibraryInfo &TLI) {
DSEState State(F, AA, MSSA, DT, PDT, TLI);
const TargetLibraryInfo &TLI, const LoopInfo &LI) {
DSEState State(F, AA, MSSA, DT, PDT, TLI, LI);
// Collect blocks with throwing instructions not modeled in MemorySSA and
// alloc-like objects.
unsigned PO = 0;
Expand Down Expand Up @@ -911,6 +919,9 @@ struct DSEState {
State.InvisibleToCallerAfterRet.insert({&AI, true});
}

// Collect whether there is any irreducible control flow in the function.
State.ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI);

return State;
}

Expand All @@ -925,6 +936,12 @@ struct DSEState {
isOverwrite(const Instruction *LaterI, const Instruction *EarlierI,
const MemoryLocation &Later, const MemoryLocation &Earlier,
int64_t &EarlierOff, int64_t &LaterOff) {
// AliasAnalysis does not always account for loops. Limit overwrite checks
// to dependencies for which we can guarantee they are independant of any
// loops they are in.
if (!isGuaranteedLoopIndependent(EarlierI, LaterI, Earlier))
return OW_Unknown;

// FIXME: Vet that this works for size upper-bounds. Seems unlikely that we'll
// get imprecise values here, though (except for unknown sizes).
if (!Later.Size.isPrecise() || !Earlier.Size.isPrecise()) {
Expand Down Expand Up @@ -1250,11 +1267,33 @@ struct DSEState {
return isRefSet(BatchAA.getModRefInfo(UseInst, DefLoc));
}

/// Returns true if a dependency between \p Current and \p KillingDef is
/// guaranteed to be loop invariant for the loops that they are in. Either
/// because they are known to be in the same block, in the same loop level or
/// by guaranteeing that \p CurrentLoc only references a single MemoryLocation
/// during execution of the containing function.
bool isGuaranteedLoopIndependent(const Instruction *Current,
const Instruction *KillingDef,
const MemoryLocation &CurrentLoc) {
// If the dependency is within the same block or loop level (being careful
// of irreducible loops), we know that AA will return a valid result for the
// memory dependency. (Both at the function level, outside of any loop,
// would also be valid but we currently disable that to limit compile time).
if (Current->getParent() == KillingDef->getParent())
return true;
const Loop *CurrentLI = LI.getLoopFor(Current->getParent());
if (!ContainsIrreducibleLoops && CurrentLI &&
CurrentLI == LI.getLoopFor(KillingDef->getParent()))
return true;
// Otherwise check the memory location is invariant to any loops.
return isGuaranteedLoopInvariant(CurrentLoc.Ptr);
}

/// Returns true if \p Ptr is guaranteed to be loop invariant for any possible
/// loop. In particular, this guarantees that it only references a single
/// MemoryLocation during execution of the containing function.
bool IsGuaranteedLoopInvariant(Value *Ptr) {
auto IsGuaranteedLoopInvariantBase = [this](Value *Ptr) {
bool isGuaranteedLoopInvariant(const Value *Ptr) {
auto IsGuaranteedLoopInvariantBase = [this](const Value *Ptr) {
Ptr = Ptr->stripPointerCasts();
if (auto *I = dyn_cast<Instruction>(Ptr)) {
if (isa<AllocaInst>(Ptr))
Expand Down Expand Up @@ -1398,9 +1437,9 @@ struct DSEState {

// AliasAnalysis does not account for loops. Limit elimination to
// candidates for which we can guarantee they always store to the same
// memory location and not multiple locations in a loop.
if (Current->getBlock() != KillingDef->getBlock() &&
!IsGuaranteedLoopInvariant(const_cast<Value *>(CurrentLoc->Ptr))) {
// memory location and not located in different loops.
if (!isGuaranteedLoopIndependent(CurrentI, KillingI, *CurrentLoc)) {
LLVM_DEBUG(dbgs() << " ... not guaranteed loop independent\n");
StepAgain = true;
Current = CurrentDef->getDefiningAccess();
WalkerStepLimit -= 1;
Expand Down Expand Up @@ -1529,8 +1568,16 @@ struct DSEState {
return None;
}

// For the KillingDef and EarlierAccess we only have to check if it reads
// the memory location.
// If this worklist walks back to the original memory access (and the
// pointer is not guarenteed loop invariant) then we cannot assume that a
// store kills itself.
if (EarlierAccess == UseAccess &&
!isGuaranteedLoopInvariant(CurrentLoc->Ptr)) {
LLVM_DEBUG(dbgs() << " ... found not loop invariant self access\n");
return None;
}
// Otherwise, for the KillingDef and EarlierAccess we only have to check
// if it reads the memory location.
// TODO: It would probably be better to check for self-reads before
// calling the function.
if (KillingDef == UseAccess || EarlierAccess == UseAccess) {
Expand All @@ -1550,16 +1597,18 @@ struct DSEState {
// stores [0,1]
if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess)) {
if (isCompleteOverwrite(*CurrentLoc, EarlierMemInst, UseInst)) {
if (!isInvisibleToCallerAfterRet(DefUO) &&
UseAccess != EarlierAccess) {
BasicBlock *MaybeKillingBlock = UseInst->getParent();
if (PostOrderNumbers.find(MaybeKillingBlock)->second <
PostOrderNumbers.find(EarlierAccess->getBlock())->second) {

BasicBlock *MaybeKillingBlock = UseInst->getParent();
if (PostOrderNumbers.find(MaybeKillingBlock)->second <
PostOrderNumbers.find(EarlierAccess->getBlock())->second) {
if (!isInvisibleToCallerAfterRet(DefUO)) {
LLVM_DEBUG(dbgs()
<< " ... found killing def " << *UseInst << "\n");
KillingDefs.insert(UseInst);
}
} else {
LLVM_DEBUG(dbgs()
<< " ... found preceeding def " << *UseInst << "\n");
return None;
}
} else
PushMemUses(UseDef);
Expand Down Expand Up @@ -1836,12 +1885,13 @@ struct DSEState {
}
};

bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
DominatorTree &DT, PostDominatorTree &PDT,
const TargetLibraryInfo &TLI) {
static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
DominatorTree &DT, PostDominatorTree &PDT,
const TargetLibraryInfo &TLI,
const LoopInfo &LI) {
bool MadeChange = false;

DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI);
DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI, LI);
// For each store:
for (unsigned I = 0; I < State.MemDefs.size(); I++) {
MemoryDef *KillingDef = State.MemDefs[I];
Expand Down Expand Up @@ -2014,8 +2064,9 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
LoopInfo &LI = AM.getResult<LoopAnalysis>(F);

bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI);
bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);

#ifdef LLVM_ENABLE_STATS
if (AreStatisticsEnabled())
Expand All @@ -2029,6 +2080,7 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
PA.preserve<MemorySSAAnalysis>();
PA.preserve<LoopAnalysis>();
return PA;
}

Expand All @@ -2054,8 +2106,9 @@ class DSELegacyPass : public FunctionPass {
MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
PostDominatorTree &PDT =
getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();

bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI);
bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);

#ifdef LLVM_ENABLE_STATS
if (AreStatisticsEnabled())
Expand All @@ -2077,6 +2130,8 @@ class DSELegacyPass : public FunctionPass {
AU.addRequired<MemorySSAWrapperPass>();
AU.addPreserved<PostDominatorTreeWrapperPass>();
AU.addPreserved<MemorySSAWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
}
};

Expand All @@ -2093,6 +2148,7 @@ INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false,
false)

Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
Expand Up @@ -517,8 +517,8 @@
; GCN-O2-NEXT: Function Alias Analysis Results
; GCN-O2-NEXT: Memory SSA
; GCN-O2-NEXT: MemCpy Optimization
; GCN-O2-NEXT: Dead Store Elimination
; GCN-O2-NEXT: Natural Loop Information
; GCN-O2-NEXT: Dead Store Elimination
; GCN-O2-NEXT: Canonicalize natural loops
; GCN-O2-NEXT: LCSSA Verifier
; GCN-O2-NEXT: Loop-Closed SSA Form Pass
Expand Down Expand Up @@ -877,8 +877,8 @@
; GCN-O3-NEXT: Function Alias Analysis Results
; GCN-O3-NEXT: Memory SSA
; GCN-O3-NEXT: MemCpy Optimization
; GCN-O3-NEXT: Dead Store Elimination
; GCN-O3-NEXT: Natural Loop Information
; GCN-O3-NEXT: Dead Store Elimination
; GCN-O3-NEXT: Canonicalize natural loops
; GCN-O3-NEXT: LCSSA Verifier
; GCN-O3-NEXT: Loop-Closed SSA Form Pass
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Other/opt-O2-pipeline.ll
Expand Up @@ -162,8 +162,8 @@
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Memory SSA
; CHECK-NEXT: MemCpy Optimization
; CHECK-NEXT: Dead Store Elimination
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Dead Store Elimination
; CHECK-NEXT: Canonicalize natural loops
; CHECK-NEXT: LCSSA Verifier
; CHECK-NEXT: Loop-Closed SSA Form Pass
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
Expand Up @@ -167,8 +167,8 @@
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Memory SSA
; CHECK-NEXT: MemCpy Optimization
; CHECK-NEXT: Dead Store Elimination
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Dead Store Elimination
; CHECK-NEXT: Canonicalize natural loops
; CHECK-NEXT: LCSSA Verifier
; CHECK-NEXT: Loop-Closed SSA Form Pass
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Other/opt-O3-pipeline.ll
Expand Up @@ -167,8 +167,8 @@
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Memory SSA
; CHECK-NEXT: MemCpy Optimization
; CHECK-NEXT: Dead Store Elimination
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Dead Store Elimination
; CHECK-NEXT: Canonicalize natural loops
; CHECK-NEXT: LCSSA Verifier
; CHECK-NEXT: Loop-Closed SSA Form Pass
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Other/opt-Os-pipeline.ll
Expand Up @@ -148,8 +148,8 @@
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Memory SSA
; CHECK-NEXT: MemCpy Optimization
; CHECK-NEXT: Dead Store Elimination
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Dead Store Elimination
; CHECK-NEXT: Canonicalize natural loops
; CHECK-NEXT: LCSSA Verifier
; CHECK-NEXT: Loop-Closed SSA Form Pass
Expand Down
3 changes: 0 additions & 3 deletions llvm/test/Transforms/DeadStoreElimination/multiblock-loops.ll
Expand Up @@ -111,7 +111,6 @@ define void @test_loop(i32 %N, i32* noalias nocapture readonly %A, i32* noalias
; CHECK: for.body4.lr.ph:
; CHECK-NEXT: [[I_028:%.*]] = phi i32 [ [[INC11:%.*]], [[FOR_COND_CLEANUP3:%.*]] ], [ 0, [[FOR_BODY4_LR_PH_PREHEADER]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[I_028]]
; CHECK-NEXT: store i32 0, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_028]], [[N]]
; CHECK-NEXT: br label [[FOR_BODY4:%.*]]
; CHECK: for.body4:
Expand Down Expand Up @@ -179,7 +178,6 @@ define i32 @test_if(i1 %c, i32* %p, i32 %i) {
; CHECK-NEXT: [[PH:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[BB3:%.*]] ]
; CHECK-NEXT: [[INC]] = add i32 [[PH]], 1
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i32 [[PH]]
; CHECK-NEXT: store i32 [[I:%.*]], i32* [[GEP]], align 4
; CHECK-NEXT: br i1 [[C:%.*]], label [[BB2:%.*]], label [[BB3]]
; CHECK: bb2:
; CHECK-NEXT: br label [[BB3]]
Expand Down Expand Up @@ -216,7 +214,6 @@ define i32 @test_if2(i1 %c, i32* %p, i32 %i) {
; CHECK-NEXT: [[PH:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[BB2:%.*]] ], [ [[INC]], [[BB3:%.*]] ]
; CHECK-NEXT: [[INC]] = add i32 [[PH]], 1
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i32 [[PH]]
; CHECK-NEXT: store i32 [[I:%.*]], i32* [[GEP]], align 4
; CHECK-NEXT: br i1 [[C:%.*]], label [[BB2]], label [[BB3]]
; CHECK: bb2:
; CHECK-NEXT: store i32 2, i32* [[GEP]], align 4
Expand Down

0 comments on commit a24b021

Please sign in to comment.