Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[AArch64][Falkor] Avoid HW prefetcher tag collisions (step 1)
Summary: This patch is the first step in reducing HW prefetcher instruction tag collisions in inner loops for Falkor. It adds a pass that annotates IR loads with metadata to indicate that they are known to be strided loads, and adds a target lowering hook that translates this metadata to a target-specific MachineMemOperand flag. A follow-on change will use this MachineMemOperand flag to re-write instructions to reduce tag collisions. Reviewers: mcrosier, t.p.northover Subscribers: aemerson, rengolin, mgorny, javed.absar, kristof.beyls, llvm-commits Differential Revision: https://reviews.llvm.org/D34963 llvm-svn: 308059
- Loading branch information
1 parent
2b9b9c8
commit b1e8714
Showing
10 changed files
with
299 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
//===-- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor--===// | ||
// | ||
// The LLVM Compiler Infrastructure | ||
// | ||
// This file is distributed under the University of Illinois Open Source | ||
// License. See LICENSE.TXT for details. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
// For Falkor, we want to avoid HW prefetcher instruction tag collisions that | ||
// may inhibit the HW prefetching. This is done in two steps. Before ISel, we | ||
// mark strided loads (i.e. those that will likely benefit from prefetching) | ||
// with metadata. Then, after opcodes have been finalized, we insert MOVs and | ||
// re-write loads to prevent unintentional tag collisions. | ||
// ===----------------------------------------------------------------------===// | ||
|
||
#include "AArch64.h" | ||
#include "AArch64InstrInfo.h" | ||
#include "AArch64TargetMachine.h" | ||
#include "llvm/ADT/DepthFirstIterator.h" | ||
#include "llvm/ADT/Statistic.h" | ||
#include "llvm/Analysis/LoopInfo.h" | ||
#include "llvm/Analysis/ScalarEvolution.h" | ||
#include "llvm/Analysis/ScalarEvolutionExpressions.h" | ||
#include "llvm/CodeGen/TargetPassConfig.h" | ||
#include "llvm/IR/Dominators.h" | ||
#include "llvm/IR/Function.h" | ||
#include "llvm/IR/Module.h" | ||
#include "llvm/Support/CommandLine.h" | ||
#include "llvm/Support/Debug.h" | ||
|
||
using namespace llvm; | ||
|
||
#define DEBUG_TYPE "falkor-hwpf-fix" | ||
|
||
STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked"); | ||
|
||
namespace { | ||
|
||
// Pass-manager-independent worker that walks the loops in LoopInfo and tags | ||
// qualifying loads with FALKOR_STRIDED_ACCESS_MD metadata. | ||
class FalkorMarkStridedAccesses { | ||
public: | ||
// Keeps references to the supplied analyses; nothing is copied or owned. | ||
FalkorMarkStridedAccesses(LoopInfo &LI, ScalarEvolution &SE) | ||
: LI(LI), SE(SE) {} | ||
|
||
// Processes every loop known to LI; returns true if any load was marked. | ||
bool run(); | ||
|
||
private: | ||
// Marks the strided loads of a single loop; returns true on any change. | ||
bool runOnLoop(Loop *L); | ||
|
||
LoopInfo &LI; | ||
ScalarEvolution &SE; | ||
}; | ||
|
||
// Legacy pass manager wrapper around FalkorMarkStridedAccesses. | ||
class FalkorMarkStridedAccessesLegacy : public FunctionPass { | ||
public: | ||
static char ID; // Pass ID, replacement for typeid | ||
// Initializes the pass in the global PassRegistry on construction. | ||
FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) { | ||
initializeFalkorMarkStridedAccessesLegacyPass( | ||
*PassRegistry::getPassRegistry()); | ||
} | ||
|
||
// Requires TargetPassConfig, LoopInfo and ScalarEvolution; declares the | ||
// dominator tree and LoopInfo as preserved. | ||
void getAnalysisUsage(AnalysisUsage &AU) const override { | ||
AU.addRequired<TargetPassConfig>(); | ||
AU.addPreserved<DominatorTreeWrapperPass>(); | ||
AU.addRequired<LoopInfoWrapperPass>(); | ||
AU.addPreserved<LoopInfoWrapperPass>(); | ||
AU.addRequired<ScalarEvolutionWrapperPass>(); | ||
// FIXME: For some reason, preserving SE here breaks LSR (even if | ||
// this pass changes nothing). | ||
// AU.addPreserved<ScalarEvolutionWrapperPass>(); | ||
} | ||
|
||
// Per-function entry point; returns true if any load was annotated. | ||
bool runOnFunction(Function &F) override; | ||
}; | ||
} // namespace | ||
|
||
// Definition of the pass ID, followed by registration of the legacy pass | ||
// under the DEBUG_TYPE name ("falkor-hwpf-fix") together with the analyses | ||
// it depends on. | ||
char FalkorMarkStridedAccessesLegacy::ID = 0; | ||
INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE, | ||
"Falkor HW Prefetch Fix", false, false) | ||
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) | ||
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) | ||
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) | ||
INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE, | ||
"Falkor HW Prefetch Fix", false, false) | ||
|
||
// Factory for the legacy pass. Returns a heap-allocated instance; | ||
// NOTE(review): presumably the pass manager takes ownership - confirm. | ||
FunctionPass *llvm::createFalkorMarkStridedAccessesPass() { | ||
return new FalkorMarkStridedAccessesLegacy(); | ||
} | ||
|
||
// Runs the strided-load marking on F. Returns false without touching the IR | ||
// unless F's subtarget is Falkor and skipFunction(F) is false; otherwise | ||
// returns whether any load was annotated. | ||
bool FalkorMarkStridedAccessesLegacy::runOnFunction(Function &F) { | ||
// The subtarget is looked up per function through the target machine. | ||
TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); | ||
const AArch64Subtarget *ST = | ||
TPC.getTM<AArch64TargetMachine>().getSubtargetImpl(F); | ||
if (ST->getProcFamily() != AArch64Subtarget::Falkor) | ||
return false; | ||
|
||
if (skipFunction(F)) | ||
return false; | ||
|
||
LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); | ||
ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); | ||
|
||
// Delegate the actual work to the pass-manager-independent implementation. | ||
FalkorMarkStridedAccesses LDP(LI, SE); | ||
return LDP.run(); | ||
} | ||
|
||
// Visits each top-level loop in LI and, depth-first, every loop nested | ||
// inside it. Returns true if runOnLoop reported a change for any of them. | ||
bool FalkorMarkStridedAccesses::run() { | ||
bool MadeChange = false; | ||
|
||
for (Loop *I : LI) | ||
for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L) | ||
MadeChange |= runOnLoop(*L); | ||
|
||
return MadeChange; | ||
} | ||
|
||
// Attaches FALKOR_STRIDED_ACCESS_MD metadata to every load in L whose | ||
// address varies within the loop and is an affine add-recurrence according | ||
// to ScalarEvolution. Loops that contain sub-loops are left untouched. | ||
// Returns true if at least one load was annotated. | ||
bool FalkorMarkStridedAccesses::runOnLoop(Loop *L) { | ||
// Only mark strided loads in the inner-most loop | ||
if (!L->empty()) | ||
return false; | ||
|
||
bool MadeChange = false; | ||
|
||
for (const auto BB : L->blocks()) { | ||
for (auto &I : *BB) { | ||
// Only load instructions are candidates. | ||
LoadInst *LoadI = dyn_cast<LoadInst>(&I); | ||
if (!LoadI) | ||
continue; | ||
|
||
// An address that is invariant in the loop is not a strided access. | ||
Value *PtrValue = LoadI->getPointerOperand(); | ||
if (L->isLoopInvariant(PtrValue)) | ||
continue; | ||
|
||
// Require the address to be an affine add-recurrence in SCEV terms. | ||
const SCEV *LSCEV = SE.getSCEV(PtrValue); | ||
const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); | ||
if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) | ||
continue; | ||
|
||
// The attached node has no operands; the metadata kind is the marker. | ||
LoadI->setMetadata(FALKOR_STRIDED_ACCESS_MD, | ||
MDNode::get(LoadI->getContext(), {})); | ||
++NumStridedLoadsMarked; | ||
DEBUG(dbgs() << "Load: " << I << " marked as strided\n"); | ||
MadeChange = true; | ||
} | ||
} | ||
|
||
return MadeChange; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=falkor | FileCheck %s | ||
; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s --check-prefix=NOHWPF | ||
|
||
; Check that strided access metadata is added to loads in inner loops when compiling for Falkor. | ||
|
||
; CHECK-LABEL: @hwpf1( | ||
; CHECK: load i32, i32* %gep, !falkor.strided.access !0 | ||
; CHECK: load i32, i32* %gep2, !falkor.strided.access !0 | ||
|
||
; NOHWPF-LABEL: @hwpf1( | ||
; NOHWPF: load i32, i32* %gep{{$}} | ||
; NOHWPF: load i32, i32* %gep2{{$}} | ||
; Single loop with two loads whose addresses are both indexed by the | ||
; induction variable %iv. | ||
define void @hwpf1(i32* %p, i32* %p2) { | ||
entry: | ||
br label %loop | ||
|
||
loop: | ||
%iv = phi i32 [ 0, %entry ], [ %inc, %loop ] | ||
|
||
; First load: address is %p indexed by %iv. | ||
%gep = getelementptr inbounds i32, i32* %p, i32 %iv | ||
%load = load i32, i32* %gep | ||
|
||
; Second load: address is %p2 indexed by %iv. | ||
%gep2 = getelementptr inbounds i32, i32* %p2, i32 %iv | ||
%load2 = load i32, i32* %gep2 | ||
|
||
%inc = add i32 %iv, 1 | ||
%exitcnd = icmp uge i32 %inc, 1024 | ||
br i1 %exitcnd, label %exit, label %loop | ||
|
||
exit: | ||
ret void | ||
} | ||
|
||
; Check that outer loop strided load isn't marked. | ||
; CHECK-LABEL: @hwpf2( | ||
; CHECK: load i32, i32* %gep, !falkor.strided.access !0 | ||
; CHECK: load i32, i32* %gep2{{$}} | ||
|
||
; NOHWPF-LABEL: @hwpf2( | ||
; NOHWPF: load i32, i32* %gep{{$}} | ||
; NOHWPF: load i32, i32* %gep2{{$}} | ||
; Two-level loop nest: the load in the inner loop (%loop2) is indexed by the | ||
; inner induction variable %iv2, while the load in %exit2 belongs only to the | ||
; outer loop and is indexed by the outer induction variable %iv1. | ||
define void @hwpf2(i32* %p) { | ||
entry: | ||
br label %loop1 | ||
|
||
loop1: | ||
%iv1 = phi i32 [ 0, %entry ], [ %inc1, %loop1.latch ] | ||
%outer.sum = phi i32 [ 0, %entry ], [ %sum, %loop1.latch ] | ||
br label %loop2.header | ||
|
||
loop2.header: | ||
br label %loop2 | ||
|
||
loop2: | ||
%iv2 = phi i32 [ 0, %loop2.header ], [ %inc2, %loop2 ] | ||
%sum = phi i32 [ %outer.sum, %loop2.header ], [ %sum.inc, %loop2 ] | ||
%gep = getelementptr inbounds i32, i32* %p, i32 %iv2 | ||
%load = load i32, i32* %gep | ||
%sum.inc = add i32 %sum, %load | ||
%inc2 = add i32 %iv2, 1 | ||
%exitcnd2 = icmp uge i32 %inc2, 1024 | ||
br i1 %exitcnd2, label %exit2, label %loop2 | ||
|
||
exit2: | ||
; Outer-loop-only load, indexed by the outer induction variable. | ||
%gep2 = getelementptr inbounds i32, i32* %p, i32 %iv1 | ||
%load2 = load i32, i32* %gep2 | ||
br label %loop1.latch | ||
|
||
loop1.latch: | ||
%inc1 = add i32 %iv1, 1 | ||
%exitcnd1 = icmp uge i32 %inc1, 1024 | ||
; Branch on the outer loop's own exit condition. The inner condition | ||
; %exitcnd2 is always true here, which would end the outer loop after a | ||
; single iteration and leave %exitcnd1 unused. | ||
br i1 %exitcnd1, label %exit, label %loop1 | ||
|
||
exit: | ||
ret void | ||
} | ||
|
||
|
||
; Check that non-strided load isn't marked. | ||
; CHECK-LABEL: @hwpf3( | ||
; CHECK: load i32, i32* %gep, !falkor.strided.access !0 | ||
; CHECK: load i32, i32* %gep2{{$}} | ||
|
||
; NOHWPF-LABEL: @hwpf3( | ||
; NOHWPF: load i32, i32* %gep{{$}} | ||
; NOHWPF: load i32, i32* %gep2{{$}} | ||
; Single loop with one load indexed by the induction variable %iv and a | ||
; second load whose index is the value produced by the first load. | ||
define void @hwpf3(i32* %p, i32* %p2) { | ||
entry: | ||
br label %loop | ||
|
||
loop: | ||
%iv = phi i32 [ 0, %entry ], [ %inc, %loop ] | ||
|
||
; Address is %p indexed by %iv. | ||
%gep = getelementptr inbounds i32, i32* %p, i32 %iv | ||
%load = load i32, i32* %gep | ||
|
||
; Address is %p2 indexed by the loaded value %load, not by %iv. | ||
%gep2 = getelementptr inbounds i32, i32* %p2, i32 %load | ||
%load2 = load i32, i32* %gep2 | ||
|
||
%inc = add i32 %iv, 1 | ||
%exitcnd = icmp uge i32 %inc, 1024 | ||
br i1 %exitcnd, label %exit, label %loop | ||
|
||
exit: | ||
ret void | ||
} |
Oops, something went wrong.